Commit 8d6cd76

Ensure page encoding statistics are written to Parquet file (#7643)
1 parent: e5ad232

3 files changed, +49 / -1 lines changed

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 45 additions & 0 deletions
@@ -1328,6 +1328,7 @@ mod tests {
 
     use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
     use crate::arrow::ARROW_SCHEMA_META_KEY;
+    use crate::file::page_encoding_stats::PageEncodingStats;
     use crate::format::PageHeader;
     use crate::thrift::TCompactSliceInputProtocol;
     use arrow::datatypes::ToByteSlice;
@@ -3835,4 +3836,48 @@ mod tests {
         assert_eq!(stats.max_value.unwrap(), "Bm".as_bytes());
         assert_eq!(stats.min_value.unwrap(), "Bl".as_bytes());
     }
+
+    #[test]
+    fn test_page_encoding_statistics_roundtrip() {
+        let batch_schema = Schema::new(vec![Field::new(
+            "int32",
+            arrow_schema::DataType::Int32,
+            false,
+        )]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(batch_schema.clone()),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
+        )
+        .unwrap();
+
+        let mut file: File = tempfile::tempfile().unwrap();
+        let mut writer = ArrowWriter::try_new(&mut file, Arc::new(batch_schema), None).unwrap();
+        writer.write(&batch).unwrap();
+        let file_metadata = writer.close().unwrap();
+
+        assert_eq!(file_metadata.row_groups.len(), 1);
+        assert_eq!(file_metadata.row_groups[0].columns.len(), 1);
+        let chunk_meta = file_metadata.row_groups[0].columns[0]
+            .meta_data
+            .as_ref()
+            .expect("column metadata missing");
+        assert!(chunk_meta.encoding_stats.is_some());
+        let chunk_page_stats = chunk_meta.encoding_stats.as_ref().unwrap();
+
+        // check that the read metadata is also correct
+        let options = ReadOptionsBuilder::new().with_page_index().build();
+        let reader = SerializedFileReader::new_with_options(file, options).unwrap();
+
+        let rowgroup = reader.get_row_group(0).expect("row group missing");
+        assert_eq!(rowgroup.num_columns(), 1);
+        let column = rowgroup.metadata().column(0);
+        assert!(column.page_encoding_stats().is_some());
+        let file_page_stats = column.page_encoding_stats().unwrap();
+        let chunk_stats: Vec<PageEncodingStats> = chunk_page_stats
+            .iter()
+            .map(|x| crate::file::page_encoding_stats::try_from_thrift(x).unwrap())
+            .collect();
+        assert_eq!(&chunk_stats, file_page_stats);
+    }
 }

parquet/src/arrow/mod.rs

Lines changed: 1 addition & 1 deletion
The expected range in this error message grows from 341 to 357 bytes because the serialized column chunk metadata now carries the page encoding statistics.

@@ -481,7 +481,7 @@ mod test {
             .unwrap();
         assert_eq!(
             err.to_string(),
-            "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..341"
+            "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..357"
         );
     }

parquet/src/file/writer.rs

Lines changed: 3 additions & 0 deletions
@@ -689,6 +689,9 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
         if let Some(statistics) = metadata.statistics() {
            builder = builder.set_statistics(statistics.clone())
         }
+        if let Some(page_encoding_stats) = metadata.page_encoding_stats() {
+            builder = builder.set_page_encoding_stats(page_encoding_stats.clone())
+        }
         builder = self.set_column_crypto_metadata(builder, &metadata);
         close.metadata = builder.build()?;

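The writer.rs hunk above is the actual fix: when SerializedRowGroupWriter rebuilds the column chunk metadata on close, it now copies the page encoding statistics across instead of dropping them. Below is a minimal sketch (not part of this commit) of how the change becomes visible through the public API, assuming an application that depends on the arrow and parquet crates; the output path and variable names are illustrative only.

use std::fs::File;
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a single Int32 column, mirroring the test added in this commit.
    let schema = Arc::new(Schema::new(vec![Field::new("int32", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef],
    )?;

    // Hypothetical output path, used only for this sketch.
    let path = std::env::temp_dir().join("page_encoding_stats_demo.parquet");
    let mut writer = ArrowWriter::try_new(File::create(&path)?, schema, None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read the footer back; with this fix the column chunk metadata exposes
    // the per-page encoding statistics instead of None.
    let reader = SerializedFileReader::new(File::open(&path)?)?;
    let column = reader.metadata().row_group(0).column(0);
    for stat in column.page_encoding_stats().into_iter().flatten() {
        println!("{} {:?} page(s) encoded with {:?}", stat.count, stat.page_type, stat.encoding);
    }
    Ok(())
}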