Skip to content

Commit 8dd4a22

Browse files
authored
Read ManifestList V1 with V2 projection. (#1482)
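## Which issue does this PR close?

- Closes #1471

## What changes are included in this PR?

For ManifestList data files written in v1, this sets the default content type to DATA (0), so that the `content` field resolves to `DataContentType::Data` when those files are read with the v2 projection.

## Are these changes tested?

Yes. The new unit test `test_data_file_serialize_deserialize_v1_data_on_v2_reader` writes data files with `FormatVersion::V1`, reads them back with `FormatVersion::V2`, and asserts that `content` is `DataContentType::Data`.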
1 parent 55e181e commit 8dd4a22

File tree

3 files changed

+56
-17
lines changed

3 files changed

+56
-17
lines changed

crates/iceberg/src/avro/schema.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,20 +81,25 @@ impl SchemaVisitor for SchemaToAvroSchema {
8181
field_schema = avro_optional(field_schema)?;
8282
}
8383

84+
let default = if let Some(literal) = &field.initial_default {
85+
Some(literal.clone().try_into_json(&field.field_type)?)
86+
} else if !field.required {
87+
Some(Value::Null)
88+
} else {
89+
None
90+
};
91+
8492
let mut avro_record_field = AvroRecordField {
8593
name: field.name.clone(),
8694
schema: field_schema,
8795
order: RecordFieldOrder::Ignore,
8896
position: 0,
8997
doc: field.doc.clone(),
9098
aliases: None,
91-
default: None,
99+
default,
92100
custom_attributes: Default::default(),
93101
};
94102

95-
if !field.required {
96-
avro_record_field.default = Some(Value::Null);
97-
}
98103
avro_record_field.custom_attributes.insert(
99104
FIELD_ID_PROP.to_string(),
100105
Value::Number(Number::from(field.id)),

crates/iceberg/src/spec/manifest/_serde.rs

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,8 @@ mod tests {
330330
assert_eq!(ret, expected_ret, "Negative i64 entry should be ignored!");
331331
}
332332

333-
#[tokio::test]
334-
async fn test_data_file_serialize_deserialize() {
335-
let schema = Arc::new(
333+
fn schema() -> Arc<Schema> {
334+
Arc::new(
336335
Schema::builder()
337336
.with_fields(vec![
338337
Arc::new(NestedField::optional(
@@ -353,8 +352,11 @@ mod tests {
353352
])
354353
.build()
355354
.unwrap(),
356-
);
357-
let data_files = vec![DataFile {
355+
)
356+
}
357+
358+
fn data_files() -> Vec<DataFile> {
359+
vec![DataFile {
358360
content: DataContentType::Data,
359361
file_path: "s3://testbucket/iceberg_data/iceberg_ctl/iceberg_db/iceberg_tbl/data/00000-7-45268d71-54eb-476c-b42c-942d880c04a1-00001.parquet".to_string(),
360362
file_format: DataFileFormat::Parquet,
@@ -376,7 +378,13 @@ mod tests {
376378
referenced_data_file: None,
377379
content_offset: None,
378380
content_size_in_bytes: None,
379-
}];
381+
}]
382+
}
383+
384+
#[tokio::test]
385+
async fn test_data_file_serialize_deserialize() {
386+
let schema = schema();
387+
let data_files = data_files();
380388

381389
let mut buffer = Vec::new();
382390
let _ = write_data_files_to_avro(
@@ -398,4 +406,30 @@ mod tests {
398406

399407
assert_eq!(data_files, actual_data_file);
400408
}
409+
410+
#[tokio::test]
411+
async fn test_data_file_serialize_deserialize_v1_data_on_v2_reader() {
412+
let schema = schema();
413+
let data_files = data_files();
414+
415+
let mut buffer = Vec::new();
416+
let _ = write_data_files_to_avro(
417+
&mut buffer,
418+
data_files.clone().into_iter(),
419+
&StructType::new(vec![]),
420+
FormatVersion::V1,
421+
)
422+
.unwrap();
423+
424+
let actual_data_file = read_data_files_from_avro(
425+
&mut Cursor::new(buffer),
426+
&schema,
427+
0,
428+
&StructType::new(vec![]),
429+
FormatVersion::V2,
430+
)
431+
.unwrap();
432+
433+
assert_eq!(actual_data_file[0].content, DataContentType::Data)
434+
}
401435
}

crates/iceberg/src/spec/manifest/entry.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ use typed_builder::TypedBuilder;
2424
use crate::avro::schema_to_avro_schema;
2525
use crate::error::Result;
2626
use crate::spec::{
27-
DataContentType, DataFile, INITIAL_SEQUENCE_NUMBER, ListType, ManifestFile, MapType,
28-
NestedField, NestedFieldRef, PrimitiveType, Schema, StructType, Type,
27+
DataContentType, DataFile, INITIAL_SEQUENCE_NUMBER, ListType, Literal, ManifestFile, MapType,
28+
NestedField, NestedFieldRef, PrimitiveLiteral, PrimitiveType, Schema, StructType, Type,
2929
};
3030
use crate::{Error, ErrorKind};
3131

@@ -232,11 +232,11 @@ static FILE_SEQUENCE_NUMBER: Lazy<NestedFieldRef> = {
232232

233233
static CONTENT: Lazy<NestedFieldRef> = {
234234
Lazy::new(|| {
235-
Arc::new(NestedField::required(
236-
134,
237-
"content",
238-
Type::Primitive(PrimitiveType::Int),
239-
))
235+
Arc::new(
236+
NestedField::required(134, "content", Type::Primitive(PrimitiveType::Int))
237+
// 0 corresponds to DataContentType::Data
238+
.with_initial_default(Literal::Primitive(PrimitiveLiteral::Int(0))),
239+
)
240240
})
241241
};
242242

0 commit comments

Comments
 (0)