
Commit b554701

cut it down
1 parent e25f888 commit b554701

File tree

4 files changed: +119 -132 lines changed


Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default.

crates/iceberg/src/spec/manifest/mod.rs

Lines changed: 119 additions & 0 deletions
@@ -157,6 +157,7 @@ mod tests {
     use std::fs;
     use std::sync::Arc;
 
+    use arrow_array::StringArray;
     use tempfile::TempDir;
 
     use super::*;
@@ -1088,4 +1089,122 @@ mod tests {
         assert!(!partitions[2].clone().contains_null);
         assert_eq!(partitions[2].clone().contains_nan, Some(false));
     }
+
+    #[test]
+    fn test_data_file_serialization() {
+        // Create a simple schema
+        let schema = Schema::builder()
+            .with_schema_id(1)
+            .with_identifier_field_ids(vec![1])
+            .with_fields(vec![
+                crate::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
+                    .into(),
+                crate::spec::NestedField::required(
+                    2,
+                    "name",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create a partition spec
+        let partition_spec = PartitionSpec::builder(schema.clone())
+            .with_spec_id(1)
+            .add_partition_field("id", "id_partition", crate::spec::Transform::Identity)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Get partition type from the partition spec
+        let partition_type = partition_spec.partition_type(&schema).unwrap();
+
+        // Set version flag
+        let is_version_1 = false;
+
+        // Create a vector of DataFile objects
+        let data_files = vec![
+            DataFileBuilder::default()
+                .content(crate::spec::DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file1.parquet".to_string())
+                .file_size_in_bytes(1024)
+                .record_count(100)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
+                .value_counts(HashMap::from([(1, 100), (2, 100)]))
+                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
+                .build()
+                .unwrap(),
+            DataFileBuilder::default()
+                .content(crate::spec::DataContentType::Data)
+                .file_format(DataFileFormat::Parquet)
+                .file_path("path/to/file2.parquet".to_string())
+                .file_size_in_bytes(2048)
+                .record_count(200)
+                .partition_spec_id(1)
+                .partition(Struct::empty())
+                .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
+                .value_counts(HashMap::from([(1, 200), (2, 200)]))
+                .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
+                .build()
+                .unwrap(),
+        ];
+
+        // Serialize the DataFile objects
+        let serialized_files = data_files
+            .into_iter()
+            .map(|f| {
+                let json = serialize_data_file_to_json(f, &partition_type, is_version_1).unwrap();
+                println!("Test serialized data file: {}", json);
+                json
+            })
+            .collect::<Vec<String>>();
+
+        // Verify we have the expected number of serialized files
+        assert_eq!(serialized_files.len(), 2);
+
+        // Verify each serialized file contains expected data
+        for json in &serialized_files {
+            assert!(json.contains("path/to/file"));
+            assert!(json.contains("parquet"));
+            assert!(json.contains("record_count"));
+            assert!(json.contains("file_size_in_bytes"));
+        }
+
+        // Convert Vec<String> to StringArray and print it
+        let string_array = StringArray::from(serialized_files.clone());
+        println!("StringArray: {:?}", string_array);
+
+        // Now deserialize the JSON strings back into DataFile objects
+        println!("\nDeserializing back to DataFile objects:");
+        let deserialized_files: Vec<DataFile> = serialized_files
+            .into_iter()
+            .map(|json| {
+                let data_file = deserialize_data_file_from_json(
+                    &json,
+                    partition_spec.spec_id(),
+                    &partition_type,
+                    &schema,
+                )
+                .unwrap();
+
+                println!("Deserialized DataFile: {:?}", data_file);
+                data_file
+            })
+            .collect();
+
+        // Verify we have the expected number of deserialized files
+        assert_eq!(deserialized_files.len(), 2);
+
+        // Verify the deserialized files have the expected properties
+        for file in &deserialized_files {
+            assert_eq!(file.content_type(), crate::spec::DataContentType::Data);
+            assert_eq!(file.file_format(), DataFileFormat::Parquet);
+            assert!(file.file_path().contains("path/to/file"));
+            assert!(file.record_count() == 100 || file.record_count() == 200);
+        }
+    }
 }
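
For readers skimming the diff: the two hunks above move a JSON round-trip test for DataFile into the iceberg crate itself. Below is a rough standalone sketch of the round trip the test exercises. Every type and function name is taken from the imports of the removed datafusion test further down; the single-field schema, the main wrapper, and the assumption that DataFileBuilder tolerates omitting the metrics fields are illustrative only, not part of this commit.

use iceberg::spec::{
    DataContentType, DataFile, DataFileBuilder, DataFileFormat, NestedField, PartitionSpec,
    PrimitiveType, Schema, Struct, Transform, Type, deserialize_data_file_from_json,
    serialize_data_file_to_json,
};

fn main() {
    // Single-column schema, identity-partitioned on "id" (illustrative shape only).
    let schema = Schema::builder()
        .with_schema_id(1)
        .with_fields(vec![
            NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
        ])
        .build()
        .unwrap();
    let partition_spec = PartitionSpec::builder(schema.clone())
        .with_spec_id(1)
        .add_partition_field("id", "id_partition", Transform::Identity)
        .unwrap()
        .build()
        .unwrap();
    let partition_type = partition_spec.partition_type(&schema).unwrap();

    // One DataFile; assumes the builder defaults the metrics fields the real
    // test fills in (column_sizes, value_counts, null_value_counts).
    let file = DataFileBuilder::default()
        .content(DataContentType::Data)
        .file_format(DataFileFormat::Parquet)
        .file_path("path/to/file1.parquet".to_string())
        .file_size_in_bytes(1024)
        .record_count(100)
        .partition_spec_id(partition_spec.spec_id())
        .partition(Struct::empty())
        .build()
        .unwrap();

    // Serialize to a JSON string, then read it back against the same partition
    // type and schema; `false` mirrors the test's `is_version_1 = false`.
    let json = serialize_data_file_to_json(file, &partition_type, false).unwrap();
    let back: DataFile =
        deserialize_data_file_from_json(&json, partition_spec.spec_id(), &partition_type, &schema)
            .unwrap();
    assert_eq!(back.record_count(), 100);
}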

crates/integrations/datafusion/Cargo.toml

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@ futures = { workspace = true }
 iceberg = { workspace = true }
 parquet = { workspace = true }
 tokio = { workspace = true }
-serde_json = { workspace = true }
 uuid = { workspace = true }
 
 [dev-dependencies]

crates/integrations/datafusion/src/physical_plan/write.rs

Lines changed: 0 additions & 130 deletions
@@ -228,133 +228,3 @@ impl ExecutionPlan for IcebergWriteExec {
         )))
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use datafusion::arrow::array::StringArray;
-    use iceberg::spec::{
-        DataFile, DataFileBuilder, DataFileFormat, PartitionSpec, PrimitiveType, Schema, Struct,
-        Type, deserialize_data_file_from_json, serialize_data_file_to_json,
-    };
-
-    // todo move this to DataFileSerde?
-    #[test]
-    fn test_data_file_serialization() {
-        // Create a simple schema
-        let schema = Schema::builder()
-            .with_schema_id(1)
-            .with_identifier_field_ids(vec![1])
-            .with_fields(vec![
-                iceberg::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
-                    .into(),
-                iceberg::spec::NestedField::required(
-                    2,
-                    "name",
-                    Type::Primitive(PrimitiveType::String),
-                )
-                .into(),
-            ])
-            .build()
-            .unwrap();
-
-        // Create a partition spec
-        let partition_spec = PartitionSpec::builder(schema.clone())
-            .with_spec_id(1)
-            .add_partition_field("id", "id_partition", iceberg::spec::Transform::Identity)
-            .unwrap()
-            .build()
-            .unwrap();
-
-        // Get partition type from the partition spec
-        let partition_type = partition_spec.partition_type(&schema).unwrap();
-
-        // Set version flag
-        let is_version_1 = false;
-
-        // Create a vector of DataFile objects
-        let data_files = vec![
-            DataFileBuilder::default()
-                .content(iceberg::spec::DataContentType::Data)
-                .file_format(DataFileFormat::Parquet)
-                .file_path("path/to/file1.parquet".to_string())
-                .file_size_in_bytes(1024)
-                .record_count(100)
-                .partition_spec_id(1)
-                .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 512), (2, 512)]))
-                .value_counts(HashMap::from([(1, 100), (2, 100)]))
-                .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
-                .build()
-                .unwrap(),
-            DataFileBuilder::default()
-                .content(iceberg::spec::DataContentType::Data)
-                .file_format(DataFileFormat::Parquet)
-                .file_path("path/to/file2.parquet".to_string())
-                .file_size_in_bytes(2048)
-                .record_count(200)
-                .partition_spec_id(1)
-                .partition(Struct::empty())
-                .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
-                .value_counts(HashMap::from([(1, 200), (2, 200)]))
-                .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
-                .build()
-                .unwrap(),
-        ];
-
-        // Serialize the DataFile objects
-        let serialized_files = data_files
-            .into_iter()
-            .map(|f| {
-                let json = serialize_data_file_to_json(f, &partition_type, is_version_1).unwrap();
-                println!("Test serialized data file: {}", json);
-                json
-            })
-            .collect::<Vec<String>>();
-
-        // Verify we have the expected number of serialized files
-        assert_eq!(serialized_files.len(), 2);
-
-        // Verify each serialized file contains expected data
-        for json in &serialized_files {
-            assert!(json.contains("path/to/file"));
-            assert!(json.contains("parquet"));
-            assert!(json.contains("record_count"));
-            assert!(json.contains("file_size_in_bytes"));
-        }
-
-        // Convert Vec<String> to StringArray and print it
-        let string_array = StringArray::from(serialized_files.clone());
-        println!("StringArray: {:?}", string_array);
-
-        // Now deserialize the JSON strings back into DataFile objects
-        println!("\nDeserializing back to DataFile objects:");
-        let deserialized_files: Vec<DataFile> = serialized_files
-            .into_iter()
-            .map(|json| {
-                let data_file = deserialize_data_file_from_json(
-                    &json,
-                    partition_spec.spec_id(),
-                    &partition_type,
-                    &schema,
-                )
-                .unwrap();
-
-                println!("Deserialized DataFile: {:?}", data_file);
-                data_file
-            })
-            .collect();
-
-        // Verify we have the expected number of deserialized files
-        assert_eq!(deserialized_files.len(), 2);
-
-        // Verify the deserialized files have the expected properties
-        for file in &deserialized_files {
-            assert_eq!(file.content_type(), iceberg::spec::DataContentType::Data);
-            assert_eq!(file.file_format(), DataFileFormat::Parquet);
-            assert!(file.file_path().contains("path/to/file"));
-            assert!(file.record_count() == 100 || file.record_count() == 200);
-        }
-    }
-}
