map_no_value.parquet is generated with parquet-rs version 53.2.0
using the following code:
fn main() {
use crate::data_type::Int32Type;
use crate::file::properties::{EnabledStatistics, WriterProperties};
use crate::file::writer::SerializedFileWriter;
use crate::schema::parser::parse_message_type;
use std::sync::Arc;
let schema = "
message schema {
REQUIRED group my_map (MAP) {
REPEATED group key_value {
REQUIRED INT32 key;
OPTIONAL INT32 value;
}
}
REQUIRED group my_map_no_v (MAP) {
REPEATED group key_value {
REQUIRED INT32 key;
}
}
REQUIRED group my_list (LIST) {
REPEATED group list {
REQUIRED INT32 element;
}
}
}
";
let schema = Arc::new(parse_message_type(schema).unwrap());
// Write Parquet file to buffer
let mut file = std::fs::File::create("/tmp/map_no_value.parquet").unwrap();
let props = Arc::new(
WriterProperties::builder()
.set_statistics_enabled(EnabledStatistics::None)
.build(),
);
let mut file_writer = SerializedFileWriter::new(&mut file, schema, props).unwrap();
let mut row_group_writer = file_writer.next_row_group().unwrap();
// Write column my_map.key_value.key
let mut column_writer = row_group_writer.next_column().unwrap().unwrap();
column_writer
.typed::<Int32Type>()
.write_batch(
&[1, 2, 3, 4, 5, 6, 7, 8, 9],
Some(&[1, 1, 1, 1, 1, 1, 1, 1, 1]),
Some(&[0, 1, 1, 0, 1, 1, 0, 1, 1]),
)
.unwrap();
column_writer.close().unwrap();
// Write column my_map.key_value.value
let mut column_writer = row_group_writer.next_column().unwrap().unwrap();
column_writer
.typed::<Int32Type>()
.write_batch(
&[],
Some(&[1, 1, 1, 1, 1, 1, 1, 1, 1]),
Some(&[0, 1, 1, 0, 1, 1, 0, 1, 1]),
)
.unwrap();
column_writer.close().unwrap();
// Write column my_map_no_v.key_value.key
let mut column_writer = row_group_writer.next_column().unwrap().unwrap();
column_writer
.typed::<Int32Type>()
.write_batch(
&[1, 2, 3, 4, 5, 6, 7, 8, 9],
Some(&[1, 1, 1, 1, 1, 1, 1, 1, 1]),
Some(&[0, 1, 1, 0, 1, 1, 0, 1, 1]),
)
.unwrap();
column_writer.close().unwrap();
// Write column my_list.list.element
let mut column_writer = row_group_writer.next_column().unwrap().unwrap();
column_writer
.typed::<Int32Type>()
.write_batch(
&[1, 2, 3, 4, 5, 6, 7, 8, 9],
Some(&[1, 1, 1, 1, 1, 1, 1, 1, 1]),
Some(&[0, 1, 1, 0, 1, 1, 0, 1, 1]),
)
.unwrap();
column_writer.close().unwrap();
// Finalize Parquet file
row_group_writer.close().unwrap();
file_writer.close().unwrap();
}
It contains a MAP with all null values, a second MAP without a values field, and
an equivalent LIST repeating the MAP keys. The first column is 3 MAP rows:
{1 -> null, 2 -> null, 3 -> null}
{4 -> null, 5 -> null, 6 -> null}
{7 -> null, 8 -> null, 9 -> null}
The last two columns comprise 3 equivalent rows of list<Integer>:
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
Here is the file metadata printed by parquet-cli:
File path: map_no_value.parquet
Created by: parquet-rs version 53.2.0
Properties: (none)
Schema:
message schema {
required group my_map (MAP) {
repeated group key_value {
required int32 key;
optional int32 value;
}
}
required group my_map_no_v (MAP) {
repeated group key_value {
required int32 key;
}
}
required group my_list (LIST) {
repeated group list {
required int32 element;
}
}
}
Row group 0: count: 3 105.00 B records start: 4 total(compressed): 315 B total(uncompressed):315 B
--------------------------------------------------------------------------------
type encodings count avg size nulls min / max
my_map.key_value.key INT32 _ RR_ 9 10.00 B
my_map.key_value.value INT32 _ RR_ 9 5.00 B
my_map_no_v.key_value.key INT32 _ RR_ 9 10.00 B
my_list.list.element INT32 _ RR_ 9 10.00 B