From e3a0b508876bc303f806b12810f13227463ef65d Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 20 Aug 2025 12:28:59 -0700
Subject: [PATCH 001/126] custom PageLocation decoder for speed

---
 parquet/src/file/page_index/offset_index.rs | 64 ++++++++++++++++++++-
 parquet/src/parquet_thrift.rs               | 13 +++++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs
index d4c196a3ae8b..791f61d37eae 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -25,7 +25,7 @@ use crate::{
     thrift_struct,
 };
 
-thrift_struct!(
+/*thrift_struct!(
 /// Page location information for [`OffsetIndexMetaData`]
 pub struct PageLocation {
   /// Offset of the page in the file
@@ -37,7 +37,67 @@ pub struct PageLocation {
   /// (repetition_level = 0).
   3: required i64 first_row_index
 }
-);
+);*/
+
+// hand coding this one because it is very time critical
+
+/// Page location information for [`OffsetIndexMetaData`]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PageLocation {
+    /// Offset of the page in the file
+    pub offset: i64,
+    /// Size of the page, including header. Sum of compressed_page_size and header length
+    pub compressed_page_size: i32,
+    /// Index within the RowGroup of the first row of the page. When an
+    /// OffsetIndex is present, pages must begin on row boundaries
+    /// (repetition_level = 0).
+    pub first_row_index: i64,
+}
+
+// Note: this will fail if the fields are either out of order, or if a suboptimal
+// encoder doesn't use field deltas. If that ever occurs, remove this code and
+// revert to the commented out thrift_struct!() implementation above.
+impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for PageLocation {
+    type Error = ParquetError;
+    fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self, Self::Error> {
+        // there are 3 fields, all mandatory, so all field deltas should be 1
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I64 as u8 {
+            return Err(general_err!("error reading PageLocation::offset"));
+        }
+        let offset = prot.read_i64()?;
+
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I32 as u8 {
+            return Err(general_err!(
+                "error reading PageLocation::compressed_page_size"
+            ));
+        }
+        let compressed_page_size = prot.read_i32()?;
+
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I64 as u8 {
+            return Err(general_err!("error reading PageLocation::first_row_index"));
+        }
+        let first_row_index = prot.read_i64()?;
+
+        // This loop slows things down a bit, but it's an acceptable price to allow
+        // forwards compatibility. We could instead assert the next field is Stop.
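+        // Consume any trailing fields a newer writer may have appended, skipping
+        // each one until the Stop marker (0x00) is seen. Per the Thrift compact
+        // protocol, each header byte packs the field-id delta in the high nibble
+        // and the field type in the low nibble, e.g. 0x16 is delta 1, type I64 (6).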
+ loop { + let (field_type, _) = prot.read_field_header()?; + if field_type == FieldType::Stop as u8 { + break; + } + prot.skip(FieldType::try_from(field_type)?)?; + } + + Ok(Self { + offset, + compressed_page_size, + first_row_index, + }) + } +} impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 7f5fe475217f..2dff498372f0 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -244,6 +244,19 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { Ok(()) } + // This is a specialized version of read_field_begin, solely for use in parsing + // PageLocation structs in the offset index. This function assumes that the delta + // field will always be less than 0xf, fields will be in order, and no boolean fields + // will be read. This also skips validation of the field type. + // + // Returns a tuple of (field_type, field_delta) + pub(crate) fn read_field_header(&mut self) -> Result<(u8, u8)> { + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = field_type & 0xf; + Ok((field_type, field_delta)) + } + pub(crate) fn read_field_begin(&mut self) -> Result { // we can read at least one byte, which is: // - the type From 71d3859642701c3f90f3a16f5ae34582f5c00b85 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 12:55:50 -0700 Subject: [PATCH 002/126] fix recently added test --- parquet/tests/arrow_reader/io/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index b31f295755b0..3a09181c72cf 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -49,7 +49,6 @@ use parquet::data_type::AsBytes; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex}; use parquet::file::properties::WriterProperties; use parquet::file::FOOTER_SIZE; -use parquet::format::PageLocation; use parquet::schema::types::SchemaDescriptor; use std::collections::BTreeMap; use std::fmt::Display; @@ -257,7 +256,7 @@ struct TestColumnChunk { dictionary_page_location: Option, /// The location of the data pages in the file - page_locations: Vec, + page_locations: Vec, } /// Information about the pages in a single row group @@ -296,6 +295,11 @@ impl TestRowGroups { let start_offset = start_offset as usize; let end_offset = start_offset + length as usize; + let page_locations = page_locations + .iter() + .map(|loc| parquet::format::PageLocation::from(loc)) + .collect(); + TestColumnChunk { name: column_name.clone(), location: start_offset..end_offset, From ff42e5a86bce951c287794748b55fce7f74dad51 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 13:12:51 -0700 Subject: [PATCH 003/126] clippy --- parquet/tests/arrow_reader/io/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index 3a09181c72cf..65a014967b4a 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -297,7 +297,7 @@ impl TestRowGroups { let page_locations = page_locations .iter() - .map(|loc| parquet::format::PageLocation::from(loc)) + .map(parquet::format::PageLocation::from) .collect(); TestColumnChunk { From 1f2c2161c4f554ffb56385627dde5a4af2abcbcf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 13:38:46 -0700 
Subject: [PATCH 004/126] experimental new form for column index

---
 parquet/src/file/page_index/index_reader.rs | 235 +++++++++++++++++++-
 1 file changed, 224 insertions(+), 11 deletions(-)

diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index fbe6d3984596..fe56d4880d55 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -18,14 +18,17 @@
 //! Support for reading [`Index`] and [`OffsetIndexMetaData`] from parquet metadata.
 
 use crate::basic::{BoundaryOrder, Type};
-use crate::data_type::Int96;
+use crate::data_type::private::ParquetValueType;
+use crate::data_type::{ByteArray, FixedLenByteArray, Int96};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::ColumnChunkMetaData;
-use crate::file::page_index::index::{Index, NativeIndex};
+use crate::file::page_index::index::Index;
 use crate::file::page_index::offset_index::OffsetIndexMetaData;
 use crate::file::reader::ChunkReader;
 use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
 use crate::thrift_struct;
+use crate::util::bit_util::*;
+use std::marker::PhantomData;
 use std::ops::Range;
 
 /// Computes the covering range of two optional ranges
@@ -146,22 +149,232 @@ pub(crate) struct ColumnIndex<'a> {
 }
 );
 
+/// column index
+pub struct NativeColumnIndex<T: ParquetValueType> {
+    phantom_data: PhantomData<T>,
+    null_pages: Vec<bool>,
+    boundary_order: BoundaryOrder,
+    null_counts: Option<Vec<i64>>,
+    repetition_level_histograms: Option<Vec<i64>>,
+    definition_level_histograms: Option<Vec<i64>>,
+    // raw bytes for min and max values
+    min_bytes: Vec<u8>,
+    min_offsets: Vec<usize>, // offsets are really only needed for BYTE_ARRAY
+    max_bytes: Vec<u8>,
+    max_offsets: Vec<usize>,
+}
+
+impl<T: ParquetValueType> NativeColumnIndex<T> {
+    fn try_new(index: ColumnIndex) -> Result<Self> {
+        let len = index.null_pages.len();
+
+        let min_len = index.min_values.iter().map(|&v| v.len()).sum();
+        let max_len = index.max_values.iter().map(|&v| v.len()).sum();
+        let mut min_bytes = vec![0u8; min_len];
+        let mut max_bytes = vec![0u8; max_len];
+
+        let mut min_offsets = vec![0usize; len + 1];
+        let mut max_offsets = vec![0usize; len + 1];
+
+        let mut min_pos = 0;
+        let mut max_pos = 0;
+
+        for (i, is_null) in index.null_pages.iter().enumerate().take(len) {
+            if !is_null {
+                let min = index.min_values[i];
+                let dst = &mut min_bytes[min_pos..min_pos + min.len()];
+                dst.copy_from_slice(min);
+                min_offsets[i] = min_pos;
+                min_pos += min.len();
+
+                let max = index.max_values[i];
+                let dst = &mut max_bytes[max_pos..max_pos + min.len()];
+                dst.copy_from_slice(max);
+                max_offsets[i] = max_pos;
+                max_pos += max.len();
+            } else {
+                min_offsets[i] = min_pos;
+                max_offsets[i] = max_pos;
+            }
+        }
+
+        min_offsets[len] = min_pos;
+        max_offsets[len] = max_pos;
+
+        Ok(Self {
+            phantom_data: PhantomData,
+            null_pages: index.null_pages,
+            boundary_order: index.boundary_order,
+            null_counts: index.null_counts,
+            repetition_level_histograms: index.repetition_level_histograms,
+            definition_level_histograms: index.definition_level_histograms,
+            min_bytes,
+            min_offsets,
+            max_bytes,
+            max_offsets,
+        })
+    }
+
+    /// Returns the number of pages
+    pub fn num_pages(&self) -> u64 {
+        self.null_pages.len() as u64
+    }
+
+    /// Returns the number of null values in the page indexed by `idx`
+    pub fn null_count(&self, idx: usize) -> Option<i64> {
+        self.null_counts.as_ref().map(|nc| nc[idx])
+    }
+
+    /// Returns the repetition level histogram for the page indexed by `idx`
+    pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        if let Some(rep_hists) = self.repetition_level_histograms.as_ref() {
+            let num_lvls = rep_hists.len() / self.num_pages() as usize;
+            let start = num_lvls * idx;
+            Some(&rep_hists[start..start + num_lvls])
+        } else {
+            None
+        }
+    }
+
+    /// Returns the definition level histogram for the page indexed by `idx`
+    pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        if let Some(def_hists) = self.definition_level_histograms.as_ref() {
+            let num_lvls = def_hists.len() / self.num_pages() as usize;
+            let start = num_lvls * idx;
+            Some(&def_hists[start..start + num_lvls])
+        } else {
+            None
+        }
+    }
+
+    /// Returns whether this is an all null page
+    pub fn is_null_page(&self, idx: usize) -> bool {
+        self.null_pages[idx]
+    }
+
+    /// Returns the minimum value in the page indexed by `idx` as raw bytes
+    ///
+    /// It is `None` when all values are null
+    pub fn min_value_bytes(&self, idx: usize) -> Option<&[u8]> {
+        if self.null_pages[idx] {
+            None
+        } else {
+            let start = self.min_offsets[idx];
+            let end = self.min_offsets[idx + 1];
+            Some(&self.min_bytes[start..end])
+        }
+    }
+
+    /// Returns the maximum value in the page indexed by `idx` as raw bytes
+    ///
+    /// It is `None` when all values are null
+    pub fn max_value_bytes(&self, idx: usize) -> Option<&[u8]> {
+        if self.null_pages[idx] {
+            None
+        } else {
+            let start = self.max_offsets[idx];
+            let end = self.max_offsets[idx + 1];
+            Some(&self.max_bytes[start..end])
+        }
+    }
+}
+
+macro_rules! min_max_values {
+    ($ty: ty) => {
+        impl NativeColumnIndex<$ty> {
+            /// Returns the minimum value in the page indexed by `idx`
+            ///
+            /// It is `None` when all values are null
+            pub fn min_value(&self, idx: usize) -> Option<$ty> {
+                <$ty>::try_from_le_slice(self.min_value_bytes(idx)?).ok()
+            }
+
+            /// Returns the maximum value in the page indexed by `idx`
+            ///
+            /// It is `None` when all values are null
+            pub fn max_value(&self, idx: usize) -> Option<$ty> {
+                <$ty>::try_from_le_slice(self.max_value_bytes(idx)?).ok()
+            }
+        }
+    };
+}
+
+min_max_values!(bool);
+min_max_values!(i32);
+min_max_values!(i64);
+min_max_values!(f32);
+min_max_values!(f64);
+min_max_values!(Int96);
+
+/// index
+#[allow(non_camel_case_types)]
+pub enum ColumnIndexMetaData {
+    /// Sometimes reading page index from parquet file
+    /// will only return pageLocations without min_max index,
+    /// `NONE` represents this lack of index information
+    NONE,
+    /// Boolean type index
+    BOOLEAN(NativeColumnIndex<bool>),
+    /// 32-bit integer type index
+    INT32(NativeColumnIndex<i32>),
+    /// 64-bit integer type index
+    INT64(NativeColumnIndex<i64>),
+    /// 96-bit integer type (timestamp) index
+    INT96(NativeColumnIndex<Int96>),
+    /// 32-bit floating point type index
+    FLOAT(NativeColumnIndex<f32>),
+    /// 64-bit floating point type index
+    DOUBLE(NativeColumnIndex<f64>),
+    /// Byte array type index
+    BYTE_ARRAY(NativeColumnIndex<ByteArray>),
+    /// Fixed length byte array type index
+    FIXED_LEN_BYTE_ARRAY(NativeColumnIndex<FixedLenByteArray>),
+}
+
+impl ColumnIndexMetaData {
+    /// Returns whether the min/max elements inside the ColumnIndex are ordered.
+    pub fn is_sorted(&self) -> bool {
+        // 0: UNORDERED, 1: ASCENDING, 2: DESCENDING
+        if let Some(order) = self.get_boundary_order() {
+            order != BoundaryOrder::UNORDERED
+        } else {
+            false
+        }
+    }
+
+    /// Get boundary_order of this page index.
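+    /// Returns `None` if no column index is present (`ColumnIndexMetaData::NONE`).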
+    pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
+        match self {
+            ColumnIndexMetaData::NONE => None,
+            ColumnIndexMetaData::BOOLEAN(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::INT32(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::INT64(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::INT96(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::FLOAT(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::DOUBLE(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::BYTE_ARRAY(index) => Some(index.boundary_order),
+            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
+        }
+    }
+}
+
 pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result<Index> {
     let mut prot = ThriftCompactInputProtocol::new(data);
     let index = ColumnIndex::try_from(&mut prot)?;
 
     let index = match column_type {
-        Type::BOOLEAN => Index::BOOLEAN(NativeIndex::<bool>::try_new_local(index)?),
-        Type::INT32 => Index::INT32(NativeIndex::<i32>::try_new_local(index)?),
-        Type::INT64 => Index::INT64(NativeIndex::<i64>::try_new_local(index)?),
-        Type::INT96 => Index::INT96(NativeIndex::<Int96>::try_new_local(index)?),
-        Type::FLOAT => Index::FLOAT(NativeIndex::<f32>::try_new_local(index)?),
-        Type::DOUBLE => Index::DOUBLE(NativeIndex::<f64>::try_new_local(index)?),
-        Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new_local(index)?),
+        Type::BOOLEAN => ColumnIndexMetaData::BOOLEAN(NativeColumnIndex::<bool>::try_new(index)?),
+        Type::INT32 => ColumnIndexMetaData::INT32(NativeColumnIndex::<i32>::try_new(index)?),
+        Type::INT64 => ColumnIndexMetaData::INT64(NativeColumnIndex::<i64>::try_new(index)?),
+        Type::INT96 => ColumnIndexMetaData::INT96(NativeColumnIndex::<Int96>::try_new(index)?),
+        Type::FLOAT => ColumnIndexMetaData::FLOAT(NativeColumnIndex::<f32>::try_new(index)?),
+        Type::DOUBLE => ColumnIndexMetaData::DOUBLE(NativeColumnIndex::<f64>::try_new(index)?),
+        Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(NativeColumnIndex::try_new(index)?),
         Type::FIXED_LEN_BYTE_ARRAY => {
-            Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new_local(index)?)
+            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(NativeColumnIndex::try_new(index)?)
} }; - Ok(index) + //Ok(index) + Ok(Index::NONE) } From 37f3b2086b3108097d55831ae577f6251103bc75 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 11:27:42 -0700 Subject: [PATCH 005/126] fix for test added in main --- parquet/tests/arrow_reader/io/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index b31f295755b0..9cafcd714e89 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -49,7 +49,6 @@ use parquet::data_type::AsBytes; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex}; use parquet::file::properties::WriterProperties; use parquet::file::FOOTER_SIZE; -use parquet::format::PageLocation; use parquet::schema::types::SchemaDescriptor; use std::collections::BTreeMap; use std::fmt::Display; @@ -257,7 +256,7 @@ struct TestColumnChunk { dictionary_page_location: Option, /// The location of the data pages in the file - page_locations: Vec, + page_locations: Vec, } /// Information about the pages in a single row group @@ -287,8 +286,11 @@ impl TestRowGroups { .enumerate() .map(|(col_idx, col_meta)| { let column_name = col_meta.column_descr().name().to_string(); - let page_locations = - offset_index[rg_index][col_idx].page_locations().to_vec(); + let page_locations = offset_index[rg_index][col_idx] + .page_locations() + .iter() + .map(parquet::format::PageLocation::from) + .collect(); let dictionary_page_location = col_meta.dictionary_page_offset(); // We can find the byte range of the entire column chunk From 3d4e28eade9e5ab4066c0cd4e91311f068d12572 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 11:29:53 -0700 Subject: [PATCH 006/126] refactor new column index --- parquet/src/file/page_index/index.rs | 5 +- parquet/src/file/page_index/index_reader.rs | 353 ++++++++++++++------ 2 files changed, 247 insertions(+), 111 deletions(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index ed586bcd33d0..22d6e92666db 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -24,7 +24,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; use crate::errors::ParquetError; use crate::file::metadata::LevelHistogram; -use crate::file::page_index::index_reader::ColumnIndex; +use crate::file::page_index::index_reader::ThriftColumnIndex; use std::fmt::Debug; /// Typed statistics for one data page @@ -310,7 +310,8 @@ impl NativeIndex { } /// Creates a new [`NativeIndex`] - pub(crate) fn try_new_local(index: ColumnIndex) -> Result { + #[allow(dead_code)] + pub(crate) fn try_new_local(index: ThriftColumnIndex) -> Result { let len = index.min_values.len(); // turn Option> into Vec> diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fe56d4880d55..1680f9ddc0ea 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -19,7 +19,7 @@ use crate::basic::{BoundaryOrder, Type}; use crate::data_type::private::ParquetValueType; -use crate::data_type::{ByteArray, FixedLenByteArray, Int96}; +use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::Index; @@ -27,9 +27,7 @@ use crate::file::page_index::offset_index::OffsetIndexMetaData; use 
crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; -use crate::util::bit_util::*; -use std::marker::PhantomData; -use std::ops::Range; +use std::ops::{Deref, Range}; /// Computes the covering range of two optional ranges /// @@ -138,7 +136,7 @@ pub(crate) fn decode_offset_index(data: &[u8]) -> Result { +pub(crate) struct ThriftColumnIndex<'a> { 1: required list null_pages 2: required list<'a> min_values 3: required list<'a> max_values @@ -149,23 +147,149 @@ pub(crate) struct ColumnIndex<'a> { } ); -/// column index -pub struct NativeColumnIndex { - phantom_data: PhantomData, +// TODO: the following should move to its own module + +/// Common bits of the column index +pub struct ColumnIndex { null_pages: Vec, boundary_order: BoundaryOrder, null_counts: Option>, repetition_level_histograms: Option>, definition_level_histograms: Option>, +} + +impl ColumnIndex { + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + self.null_pages.len() as u64 + } + + /// Returns the number of null values in the page indexed by `idx` + pub fn null_count(&self, idx: usize) -> Option { + self.null_counts.as_ref().map(|nc| nc[idx]) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { + let num_lvls = rep_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&rep_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(def_hists) = self.definition_level_histograms.as_ref() { + let num_lvls = def_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&def_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + self.null_pages[idx] + } +} + +/// Column index for primitive types +pub struct PrimitiveColumnIndex { + column_index: ColumnIndex, + min_values: Vec, + max_values: Vec, +} + +impl PrimitiveColumnIndex { + fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let mut min_values = Vec::with_capacity(len); + let mut max_values = Vec::with_capacity(len); + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + min_values.push(T::try_from_le_slice(min)?); + + let max = index.max_values[i]; + max_values.push(T::try_from_le_slice(max)?); + } else { + min_values.push(Default::default()); + max_values.push(Default::default()); + } + } + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_values, + max_values, + }) + } + + /// Returns an array containing the min values for each page + pub fn min_values(&self) -> &[T] { + &self.min_values + } + + /// Returns an array containing the max values for each page + pub fn max_values(&self) -> &[T] { + &self.max_values + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn 
min_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.min_values[idx]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.max_values[idx]) + } + } +} + +impl Deref for PrimitiveColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +/// Column index for byte arrays (fixed length and variable) +pub struct ByteArrayColumnIndex { + column_index: ColumnIndex, // raw bytes for min and max values min_bytes: Vec, - min_offsets: Vec, // offsets are really only needed for BYTE_ARRAY + min_offsets: Vec, max_bytes: Vec, max_offsets: Vec, } -impl NativeColumnIndex { - fn try_new(index: ColumnIndex) -> Result { +impl ByteArrayColumnIndex { + fn try_new(index: ThriftColumnIndex) -> Result { let len = index.null_pages.len(); let min_len = index.min_values.iter().map(|&v| v.len()).sum(); @@ -202,12 +326,14 @@ impl NativeColumnIndex { max_offsets[len] = max_pos; Ok(Self { - phantom_data: PhantomData, - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_bytes, min_offsets, max_bytes, @@ -215,47 +341,10 @@ impl NativeColumnIndex { }) } - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - self.null_pages.len() as u64 - } - - /// Returns the number of null values in the page indexed by `idx` - pub fn null_count(&self, idx: usize) -> Option { - self.null_counts.as_ref().map(|nc| nc[idx]) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { - let num_lvls = rep_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&rep_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(def_hists) = self.definition_level_histograms.as_ref() { - let num_lvls = def_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&def_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns whether this is an all null page - pub fn is_null_page(&self, idx: usize) -> bool { - self.null_pages[idx] - } - - /// Returns the minimum value in the page indexed by `idx` as raw bytes + /// Returns the min value for the page indexed by `idx` /// /// It is `None` when all values are null - pub fn min_value_bytes(&self, idx: usize) -> Option<&[u8]> { + pub fn min_value(&self, idx: usize) -> Option<&[u8]> { if self.null_pages[idx] { None } else { @@ -265,10 +354,10 @@ impl NativeColumnIndex { } } - /// Returns the maximum value in the page indexed by `idx` as raw bytes + /// Returns the max value for the page indexed by `idx` /// /// It is `None` when all values are null - pub fn 
max_value_bytes(&self, idx: usize) -> Option<&[u8]> {
+    pub fn max_value(&self, idx: usize) -> Option<&[u8]> {
         if self.null_pages[idx] {
             None
         } else {
@@ -279,32 +368,51 @@ impl<T: ParquetValueType> NativeColumnIndex<T> {
     }
 }
 
-macro_rules! min_max_values {
-    ($ty: ty) => {
-        impl NativeColumnIndex<$ty> {
-            /// Returns the minimum value in the page indexed by `idx`
-            ///
-            /// It is `None` when all values are null
-            pub fn min_value(&self, idx: usize) -> Option<$ty> {
-                <$ty>::try_from_le_slice(self.min_value_bytes(idx)?).ok()
-            }
+impl Deref for ByteArrayColumnIndex {
+    type Target = ColumnIndex;
 
-            /// Returns the maximum value in the page indexed by `idx`
-            ///
-            /// It is `None` when all values are null
-            pub fn max_value(&self, idx: usize) -> Option<$ty> {
-                <$ty>::try_from_le_slice(self.max_value_bytes(idx)?).ok()
-            }
-        }
-    };
+    fn deref(&self) -> &Self::Target {
+        &self.column_index
+    }
 }
 
-min_max_values!(bool);
-min_max_values!(i32);
-min_max_values!(i64);
-min_max_values!(f32);
-min_max_values!(f64);
-min_max_values!(Int96);
+// Macro to generate getter functions for ColumnIndexMetaData.
+macro_rules! colidx_enum_func {
+    ($self:ident, $func:ident, $arg:ident) => {{
+        match *$self {
+            Self::BOOLEAN(ref typed) => typed.$func($arg),
+            Self::INT32(ref typed) => typed.$func($arg),
+            Self::INT64(ref typed) => typed.$func($arg),
+            Self::INT96(ref typed) => typed.$func($arg),
+            Self::FLOAT(ref typed) => typed.$func($arg),
+            Self::DOUBLE(ref typed) => typed.$func($arg),
+            Self::BYTE_ARRAY(ref typed) => typed.$func($arg),
+            Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg),
+            _ => panic!(concat!(
+                "Cannot call ",
+                stringify!($func),
+                " on ColumnIndexMetaData::NONE"
+            )),
+        }
+    }};
+    ($self:ident, $func:ident) => {{
+        match *$self {
+            Self::BOOLEAN(ref typed) => typed.$func(),
+            Self::INT32(ref typed) => typed.$func(),
+            Self::INT64(ref typed) => typed.$func(),
+            Self::INT96(ref typed) => typed.$func(),
+            Self::FLOAT(ref typed) => typed.$func(),
+            Self::DOUBLE(ref typed) => typed.$func(),
+            Self::BYTE_ARRAY(ref typed) => typed.$func(),
+            Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(),
+            _ => panic!(concat!(
+                "Cannot call ",
+                stringify!($func),
+                " on ColumnIndexMetaData::NONE"
+            )),
+        }
+    }};
+}
 
 /// index
 #[allow(non_camel_case_types)]
 pub enum ColumnIndexMetaData {
@@ -314,21 +422,21 @@ pub enum ColumnIndexMetaData {
     /// Sometimes reading page index from parquet file
     /// will only return pageLocations without min_max index,
     /// `NONE` represents this lack of index information
     NONE,
     /// Boolean type index
-    BOOLEAN(NativeColumnIndex<bool>),
+    BOOLEAN(PrimitiveColumnIndex<bool>),
     /// 32-bit integer type index
-    INT32(NativeColumnIndex<i32>),
+    INT32(PrimitiveColumnIndex<i32>),
     /// 64-bit integer type index
-    INT64(NativeColumnIndex<i64>),
+    INT64(PrimitiveColumnIndex<i64>),
     /// 96-bit integer type (timestamp) index
-    INT96(NativeColumnIndex<Int96>),
+    INT96(PrimitiveColumnIndex<Int96>),
    /// 32-bit floating point type index
-    FLOAT(NativeColumnIndex<f32>),
+    FLOAT(PrimitiveColumnIndex<f32>),
     /// 64-bit floating point type index
-    DOUBLE(NativeColumnIndex<f64>),
+    DOUBLE(PrimitiveColumnIndex<f64>),
     /// Byte array type index
-    BYTE_ARRAY(NativeColumnIndex<ByteArray>),
+    BYTE_ARRAY(ByteArrayColumnIndex),
     /// Fixed length byte array type index
-    FIXED_LEN_BYTE_ARRAY(NativeColumnIndex<FixedLenByteArray>),
+    FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex),
 }
 
 impl ColumnIndexMetaData {
     /// Returns whether the min/max elements inside the ColumnIndex are ordered.
     pub fn is_sorted(&self) -> bool {
         // 0: UNORDERED, 1: ASCENDING, 2: DESCENDING
         if let Some(order) = self.get_boundary_order() {
             order != BoundaryOrder::UNORDERED
         } else {
             false
         }
     }
 
     /// Get boundary_order of this page index.
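     /// Returns `None` if no column index is present (`ColumnIndexMetaData::NONE`).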
pub fn get_boundary_order(&self) -> Option { match self { - ColumnIndexMetaData::NONE => None, - ColumnIndexMetaData::BOOLEAN(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT32(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT64(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT96(index) => Some(index.boundary_order), - ColumnIndexMetaData::FLOAT(index) => Some(index.boundary_order), - ColumnIndexMetaData::DOUBLE(index) => Some(index.boundary_order), - ColumnIndexMetaData::BYTE_ARRAY(index) => Some(index.boundary_order), - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), + Self::NONE => None, + Self::BOOLEAN(index) => Some(index.boundary_order), + Self::INT32(index) => Some(index.boundary_order), + Self::INT64(index) => Some(index.boundary_order), + Self::INT96(index) => Some(index.boundary_order), + Self::FLOAT(index) => Some(index.boundary_order), + Self::DOUBLE(index) => Some(index.boundary_order), + Self::BYTE_ARRAY(index) => Some(index.boundary_order), + Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), } } + + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + colidx_enum_func!(self, num_pages) + } + + /// Returns the number of null values in the page indexed by `idx` + pub fn null_count(&self, idx: usize) -> Option { + colidx_enum_func!(self, null_count, idx) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, repetition_level_histogram, idx) + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, definition_level_histogram, idx) + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + colidx_enum_func!(self, is_null_page, idx) + } } pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - let index = ColumnIndex::try_from(&mut prot)?; - - let index = match column_type { - Type::BOOLEAN => ColumnIndexMetaData::BOOLEAN(NativeColumnIndex::::try_new(index)?), - Type::INT32 => ColumnIndexMetaData::INT32(NativeColumnIndex::::try_new(index)?), - Type::INT64 => ColumnIndexMetaData::INT64(NativeColumnIndex::::try_new(index)?), - Type::INT96 => ColumnIndexMetaData::INT96(NativeColumnIndex::::try_new(index)?), - Type::FLOAT => ColumnIndexMetaData::FLOAT(NativeColumnIndex::::try_new(index)?), - Type::DOUBLE => ColumnIndexMetaData::DOUBLE(NativeColumnIndex::::try_new(index)?), - Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(NativeColumnIndex::try_new(index)?), + let index = ThriftColumnIndex::try_from(&mut prot)?; + + let _index = match column_type { + Type::BOOLEAN => { + ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::::try_new(index)?) 
+ } + Type::INT32 => ColumnIndexMetaData::INT32(PrimitiveColumnIndex::::try_new(index)?), + Type::INT64 => ColumnIndexMetaData::INT64(PrimitiveColumnIndex::::try_new(index)?), + Type::INT96 => ColumnIndexMetaData::INT96(PrimitiveColumnIndex::::try_new(index)?), + Type::FLOAT => ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::::try_new(index)?), + Type::DOUBLE => ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::::try_new(index)?), + Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?), Type::FIXED_LEN_BYTE_ARRAY => { - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(NativeColumnIndex::try_new(index)?) + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?) } }; From 2b85b89733fafa586287359968ba18e8acb0cef4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 14:54:46 -0700 Subject: [PATCH 007/126] checkpoint...everything but stats converter --- parquet/src/arrow/arrow_writer/mod.rs | 13 ++- parquet/src/file/metadata/memory.rs | 50 ++++++++++ parquet/src/file/metadata/mod.rs | 24 +++-- parquet/src/file/metadata/reader.rs | 8 +- parquet/src/file/metadata/writer.rs | 35 ++++--- parquet/src/file/page_index/index_reader.rs | 103 ++++++++++++++++---- parquet/src/file/serialized_reader.rs | 95 ++++++++++-------- parquet/src/file/writer.rs | 27 ++--- parquet/tests/encryption/encryption_util.rs | 10 +- 9 files changed, 258 insertions(+), 107 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index c6b0b426f9dd..1041a1af1f77 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1488,6 +1488,7 @@ mod tests { use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; use crate::file::page_encoding_stats::PageEncodingStats; + use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::format::PageHeader; use crate::schema::types::ColumnPath; @@ -1507,7 +1508,6 @@ mod tests { use crate::basic::Encoding; use crate::data_type::AsBytes; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, ParquetMetaDataReader}; - use crate::file::page_index::index::Index; use crate::file::properties::{ BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, }; @@ -4002,9 +4002,12 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 columns let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::BYTE_ARRAY(_)), "{a_idx:?}"); + assert!( + matches!(a_idx, ColumnIndexMetaData::BYTE_ARRAY(_)), + "{a_idx:?}" + ); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] @@ -4070,9 +4073,9 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 columns let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::NONE), "{a_idx:?}"); + assert!(matches!(a_idx, ColumnIndexMetaData::NONE), "{a_idx:?}"); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 0b8d3b336fc0..5c1477e2cb14 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -25,6 +25,9 @@ use crate::file::metadata::{ }; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::index::{Index, 
NativeIndex, PageIndex}; +use crate::file::page_index::index_reader::{ + ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; use std::sync::Arc; @@ -154,6 +157,48 @@ impl HeapSize for OffsetIndexMetaData { } } +impl HeapSize for ColumnIndexMetaData { + fn heap_size(&self) -> usize { + match self { + Self::NONE => 0, + Self::BOOLEAN(native_index) => native_index.heap_size(), + Self::INT32(native_index) => native_index.heap_size(), + Self::INT64(native_index) => native_index.heap_size(), + Self::INT96(native_index) => native_index.heap_size(), + Self::FLOAT(native_index) => native_index.heap_size(), + Self::DOUBLE(native_index) => native_index.heap_size(), + Self::BYTE_ARRAY(native_index) => native_index.heap_size(), + Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(), + } + } +} + +impl HeapSize for ColumnIndex { + fn heap_size(&self) -> usize { + self.null_pages.heap_size() + + self.boundary_order.heap_size() + + self.null_counts.heap_size() + + self.definition_level_histograms.heap_size() + + self.repetition_level_histograms.heap_size() + } +} + +impl HeapSize for PrimitiveColumnIndex { + fn heap_size(&self) -> usize { + self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size() + } +} + +impl HeapSize for ByteArrayColumnIndex { + fn heap_size(&self) -> usize { + self.column_index.heap_size() + + self.min_bytes.heap_size() + + self.min_offsets.heap_size() + + self.max_bytes.heap_size() + + self.max_offsets.heap_size() + } +} + impl HeapSize for Index { fn heap_size(&self) -> usize { match self { @@ -193,6 +238,11 @@ impl HeapSize for bool { 0 // no heap allocations } } +impl HeapSize for u8 { + fn heap_size(&self) -> usize { + 0 // no heap allocations + } +} impl HeapSize for i32 { fn heap_size(&self) -> usize { 0 // no heap allocations diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index f2fe9de77e72..a619d76658e9 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -106,7 +106,7 @@ use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, - page_index::offset_index::PageLocation, + page_index::{index_reader::ColumnIndexMetaData, offset_index::PageLocation}, }; use crate::file::{ page_index::index::PageIndex, @@ -156,7 +156,7 @@ pub(crate) use writer::ThriftMetadataWriter; /// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md /// [`ColumnIndex`]: crate::format::ColumnIndex -pub type ParquetColumnIndex = Vec>; +pub type ParquetColumnIndex = Vec>; /// [`OffsetIndexMetaData`] for each data page of each row group of each column /// @@ -1948,7 +1948,7 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::page_index::index::NativeIndex; + use crate::file::page_index::index_reader::{ColumnIndex, PrimitiveColumnIndex}; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -2223,7 +2223,17 @@ mod tests { let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN); column_index.append(false, vec![1u8], vec![2u8, 3u8], 4); let column_index = column_index.build_to_thrift(); - let native_index = NativeIndex::::try_new(column_index).unwrap(); + let native_index = 
PrimitiveColumnIndex:: { + column_index: ColumnIndex { + null_pages: column_index.null_pages, + boundary_order: column_index.boundary_order.try_into().unwrap(), + null_counts: column_index.null_counts, + repetition_level_histograms: column_index.repetition_level_histograms, + definition_level_histograms: column_index.definition_level_histograms, + }, + min_values: vec![], + max_values: vec![], + }; // Now, add in OffsetIndex let mut offset_index = OffsetIndexBuilder::new(); @@ -2237,16 +2247,16 @@ mod tests { let parquet_meta = ParquetMetaDataBuilder::new(file_metadata) .set_row_groups(row_group_meta) - .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]])) + .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]])) .set_offset_index(Some(vec![vec![ OffsetIndexMetaData::try_new(offset_index).unwrap() ]])) .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2784; + let bigger_expected_size = 2704; #[cfg(feature = "encryption")] - let bigger_expected_size = 3120; + let bigger_expected_size = 3040; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index a403f4eee8f0..97ea72ef964c 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -34,7 +34,7 @@ use bytes::Bytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; -use crate::file::page_index::index::Index; +use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; @@ -566,7 +566,7 @@ impl ParquetMetaDataReader { col_idx, ) } - None => Ok(Index::NONE), + None => Ok(ColumnIndexMetaData::NONE), }) .collect::>>() }) @@ -584,7 +584,7 @@ impl ParquetMetaDataReader { column: &ColumnChunkMetaData, row_group_index: usize, col_index: usize, - ) -> Result { + ) -> Result { match &column.column_crypto_metadata { Some(crypto_metadata) => { let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { @@ -612,7 +612,7 @@ impl ParquetMetaDataReader { column: &ColumnChunkMetaData, _row_group_index: usize, _col_index: usize, - ) -> Result { + ) -> Result { decode_column_index(bytes, column.column_type()) } diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index acae20ec3cef..8c485f7d0e8b 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -24,9 +24,7 @@ use crate::encryption::{ }; #[cfg(feature = "encryption")] use crate::errors::ParquetError; -use crate::errors::Result; use crate::file::metadata::{KeyValue, ParquetMetaData}; -use crate::file::page_index::index::Index; use crate::file::writer::{get_file_magic, TrackedWrite}; use crate::format::EncryptionAlgorithm; #[cfg(feature = "encryption")] @@ -34,6 +32,7 @@ use crate::format::{AesGcmV1, ColumnCryptoMetaData}; use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; +use crate::{errors::Result, file::page_index::index_reader::ColumnIndexMetaData}; use std::io::Write; use std::sync::Arc; use thrift::protocol::TCompactOutputProtocol; @@ -391,17 +390,31 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { column_indexes .iter() 
.map(|column_index| match column_index { - Index::NONE => None, - Index::BOOLEAN(column_index) => Some(column_index.to_thrift()), - Index::BYTE_ARRAY(column_index) => Some(column_index.to_thrift()), - Index::DOUBLE(column_index) => Some(column_index.to_thrift()), - Index::FIXED_LEN_BYTE_ARRAY(column_index) => { + ColumnIndexMetaData::NONE => None, + ColumnIndexMetaData::BOOLEAN(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::BYTE_ARRAY(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::DOUBLE(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::FLOAT(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT32(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT64(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT96(column_index) => { Some(column_index.to_thrift()) } - Index::FLOAT(column_index) => Some(column_index.to_thrift()), - Index::INT32(column_index) => Some(column_index.to_thrift()), - Index::INT64(column_index) => Some(column_index.to_thrift()), - Index::INT96(column_index) => Some(column_index.to_thrift()), }) .collect() }) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 1680f9ddc0ea..b030b61c4918 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,6 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; -use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; @@ -57,7 +56,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Option< pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], -) -> Result>, ParquetError> { +) -> Result>, ParquetError> { let fetch = chunks .iter() .fold(None, |range, c| acc_range(range, c.column_index_range())); @@ -78,7 +77,7 @@ pub fn read_columns_indexes( ..usize::try_from(r.end - fetch.start)?], c.column_type(), ), - None => Ok(Index::NONE), + None => Ok(ColumnIndexMetaData::NONE), }) .collect(), ) @@ -150,12 +149,13 @@ pub(crate) struct ThriftColumnIndex<'a> { // TODO: the following should move to its own module /// Common bits of the column index +#[derive(Debug, Clone, PartialEq)] pub struct ColumnIndex { - null_pages: Vec, - boundary_order: BoundaryOrder, - null_counts: Option>, - repetition_level_histograms: Option>, - definition_level_histograms: Option>, + pub(crate) null_pages: Vec, + pub(crate) boundary_order: BoundaryOrder, + pub(crate) null_counts: Option>, + pub(crate) repetition_level_histograms: Option>, + pub(crate) definition_level_histograms: Option>, } impl ColumnIndex { @@ -198,10 +198,11 @@ impl ColumnIndex { } /// Column index for primitive types +#[derive(Debug, Clone, PartialEq)] pub struct PrimitiveColumnIndex { - column_index: ColumnIndex, - min_values: Vec, - max_values: Vec, + pub(crate) column_index: ColumnIndex, + pub(crate) min_values: Vec, + pub(crate) max_values: Vec, } impl PrimitiveColumnIndex { @@ -268,6 +269,35 @@ impl PrimitiveColumnIndex { Some(&self.max_values[idx]) } } + + pub(crate) fn to_thrift(&self) -> 
crate::format::ColumnIndex { + let min_values = self + .min_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let max_values = self + .max_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } } impl Deref for PrimitiveColumnIndex { @@ -279,13 +309,14 @@ impl Deref for PrimitiveColumnIndex { } /// Column index for byte arrays (fixed length and variable) +#[derive(Debug, Clone, PartialEq)] pub struct ByteArrayColumnIndex { - column_index: ColumnIndex, + pub(crate) column_index: ColumnIndex, // raw bytes for min and max values - min_bytes: Vec, - min_offsets: Vec, - max_bytes: Vec, - max_offsets: Vec, + pub(crate) min_bytes: Vec, + pub(crate) min_offsets: Vec, + pub(crate) max_bytes: Vec, + pub(crate) max_offsets: Vec, } impl ByteArrayColumnIndex { @@ -312,7 +343,7 @@ impl ByteArrayColumnIndex { min_pos += min.len(); let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + min.len()]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; dst.copy_from_slice(max); max_offsets[i] = max_pos; max_pos += max.len(); @@ -366,6 +397,33 @@ impl ByteArrayColumnIndex { Some(&self.max_bytes[start..end]) } } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let mut min_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + min_values.push(self.min_value(i).unwrap_or(&vec![]).to_owned()); + } + + let mut max_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + max_values.push(self.max_value(i).unwrap_or(&vec![]).to_owned()); + } + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } } impl Deref for ByteArrayColumnIndex { @@ -415,6 +473,7 @@ macro_rules! colidx_enum_func { } /// index +#[derive(Debug, Clone, PartialEq)] #[allow(non_camel_case_types)] pub enum ColumnIndexMetaData { /// Sometimes reading page index from parquet file @@ -491,11 +550,14 @@ impl ColumnIndexMetaData { } } -pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { +pub(crate) fn decode_column_index( + data: &[u8], + column_type: Type, +) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); let index = ThriftColumnIndex::try_from(&mut prot)?; - let _index = match column_type { + let index = match column_type { Type::BOOLEAN => { ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::::try_new(index)?) 
} @@ -510,6 +572,5 @@ pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Resultbool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0] assert!(&column_index[0][1].is_sorted()); - if let Index::BOOLEAN(index) = &column_index[0][1] { - assert_eq!(index.indexes.len(), 82); + if let ColumnIndexMetaData::BOOLEAN(index) = &column_index[0][1] { + assert_eq!(index.num_pages(), 82); assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82); } else { unreachable!() }; //col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][2].is_sorted()); - if let Index::INT32(index) = &column_index[0][2] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][2] { check_native_page_index( index, 325, @@ -2025,7 +2025,7 @@ mod tests { }; //col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][3].is_sorted()); - if let Index::INT32(index) = &column_index[0][3] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][3] { check_native_page_index( index, 325, @@ -2038,7 +2038,7 @@ mod tests { }; //col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][4].is_sorted()); - if let Index::INT32(index) = &column_index[0][4] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][4] { check_native_page_index( index, 325, @@ -2051,7 +2051,7 @@ mod tests { }; //col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90, num_nulls: 0] assert!(!&column_index[0][5].is_sorted()); - if let Index::INT64(index) = &column_index[0][5] { + if let ColumnIndexMetaData::INT64(index) = &column_index[0][5] { check_native_page_index( index, 528, @@ -2064,7 +2064,7 @@ mod tests { }; //col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9, num_nulls: 0] assert!(&column_index[0][6].is_sorted()); - if let Index::FLOAT(index) = &column_index[0][6] { + if let ColumnIndexMetaData::FLOAT(index) = &column_index[0][6] { check_native_page_index( index, 325, @@ -2077,7 +2077,7 @@ mod tests { }; //col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 90.89999999999999, num_nulls: 0] assert!(!&column_index[0][7].is_sorted()); - if let Index::DOUBLE(index) = &column_index[0][7] { + if let ColumnIndexMetaData::DOUBLE(index) = &column_index[0][7] { check_native_page_index( index, 528, @@ -2090,8 +2090,8 @@ mod tests { }; //col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0] assert!(!&column_index[0][8].is_sorted()); - if let Index::BYTE_ARRAY(index) = &column_index[0][8] { - check_native_page_index( + if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][8] { + check_byte_array_page_index( index, 974, get_row_group_min_max_bytes(row_group_metadata, 8), @@ -2103,8 +2103,8 @@ mod tests { }; //col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][9].is_sorted()); - if 
let Index::BYTE_ARRAY(index) = &column_index[0][9] { - check_native_page_index( + if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][9] { + check_byte_array_page_index( index, 352, get_row_group_min_max_bytes(row_group_metadata, 9), @@ -2117,14 +2117,14 @@ mod tests { //col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0, min/max not defined] //Notice: min_max values for each page for this col not exits. assert!(!&column_index[0][10].is_sorted()); - if let Index::NONE = &column_index[0][10] { + if let ColumnIndexMetaData::NONE = &column_index[0][10] { assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974); } else { unreachable!() }; //col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0] assert!(&column_index[0][11].is_sorted()); - if let Index::INT32(index) = &column_index[0][11] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][11] { check_native_page_index( index, 325, @@ -2137,7 +2137,7 @@ mod tests { }; //col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0] assert!(!&column_index[0][12].is_sorted()); - if let Index::INT32(index) = &column_index[0][12] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][12] { check_native_page_index( index, 325, @@ -2151,17 +2151,31 @@ mod tests { } fn check_native_page_index( - row_group_index: &NativeIndex, + row_group_index: &PrimitiveColumnIndex, page_size: usize, min_max: (&[u8], &[u8]), boundary_order: BoundaryOrder, ) { - assert_eq!(row_group_index.indexes.len(), page_size); + assert_eq!(row_group_index.num_pages() as usize, page_size); assert_eq!(row_group_index.boundary_order, boundary_order); - row_group_index.indexes.iter().all(|x| { - x.min.as_ref().unwrap() >= &T::try_from_le_slice(min_max.0).unwrap() - && x.max.as_ref().unwrap() <= &T::try_from_le_slice(min_max.1).unwrap() - }); + assert!(row_group_index.min_values().iter().all(|x| { + x >= &T::try_from_le_slice(min_max.0).unwrap() + && x <= &T::try_from_le_slice(min_max.1).unwrap() + })); + } + + fn check_byte_array_page_index( + row_group_index: &ByteArrayColumnIndex, + page_size: usize, + min_max: (&[u8], &[u8]), + boundary_order: BoundaryOrder, + ) { + assert_eq!(row_group_index.num_pages() as usize, page_size); + assert_eq!(row_group_index.boundary_order, boundary_order); + for i in 0..row_group_index.num_pages() as usize { + let x = row_group_index.min_value(i).unwrap(); + assert!(x >= min_max.0 && x <= min_max.1); + } } fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) { @@ -2402,12 +2416,11 @@ mod tests { assert_eq!(c.len(), 1); match &c[0] { - Index::FIXED_LEN_BYTE_ARRAY(v) => { - assert_eq!(v.indexes.len(), 1); - let page_idx = &v.indexes[0]; - assert_eq!(page_idx.null_count.unwrap(), 1); - assert_eq!(page_idx.min.as_ref().unwrap().as_ref(), &[0; 11]); - assert_eq!(page_idx.max.as_ref().unwrap().as_ref(), &[5; 11]); + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => { + assert_eq!(v.num_pages(), 1); + assert_eq!(v.null_count(0).unwrap(), 1); + assert_eq!(v.min_value(0).unwrap(), &[0; 11]); + assert_eq!(v.max_value(0).unwrap(), &[5; 11]); } _ => unreachable!(), } @@ -2538,11 +2551,11 @@ mod tests { // test that we got the index matching the row group match pg_idx { - Index::INT32(int_idx) => { + ColumnIndexMetaData::INT32(int_idx) => { let min = 
col_stats.min_bytes_opt().unwrap().get_i32_le(); let max = col_stats.max_bytes_opt().unwrap().get_i32_le(); - assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref()); - assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref()); + assert_eq!(int_idx.min_value(0), Some(min).as_ref()); + assert_eq!(int_idx.max_value(0), Some(max).as_ref()); } _ => panic!("wrong stats type"), } @@ -2583,11 +2596,11 @@ mod tests { // test that we got the index matching the row group match pg_idx { - Index::INT32(int_idx) => { + ColumnIndexMetaData::INT32(int_idx) => { let min = col_stats.min_bytes_opt().unwrap().get_i32_le(); let max = col_stats.max_bytes_opt().unwrap().get_i32_le(); - assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref()); - assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref()); + assert_eq!(int_idx.min_value(0), Some(min).as_ref()); + assert_eq!(int_idx.max_value(0), Some(max).as_ref()); } _ => panic!("wrong stats type"), } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7db517ced5b2..1808e88878e7 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1063,6 +1063,7 @@ mod tests { use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; use crate::file::page_index::index::Index; + use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ @@ -2083,9 +2084,9 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 column let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::INT32(_)), "{a_idx:?}"); + assert!(matches!(a_idx, ColumnIndexMetaData::INT32(_)), "{a_idx:?}"); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] @@ -2169,16 +2170,16 @@ mod tests { let column_index = reader.metadata().column_index().unwrap(); assert_eq!(column_index.len(), 1); assert_eq!(column_index[0].len(), 1); - let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { - assert_eq!(index.indexes.len(), 1); - &index.indexes[0] + let col_idx = if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][0] { + assert_eq!(index.num_pages(), 1); + index } else { unreachable!() }; - assert!(col_idx.repetition_level_histogram().is_none()); - assert!(col_idx.definition_level_histogram().is_some()); - check_def_hist(col_idx.definition_level_histogram().unwrap().values()); + assert!(col_idx.repetition_level_histogram(0).is_none()); + assert!(col_idx.definition_level_histogram(0).is_some()); + check_def_hist(col_idx.definition_level_histogram(0).unwrap()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); @@ -2324,15 +2325,15 @@ mod tests { let column_index = reader.metadata().column_index().unwrap(); assert_eq!(column_index.len(), 1); assert_eq!(column_index[0].len(), 1); - let col_idx = if let Index::INT32(index) = &column_index[0][0] { - assert_eq!(index.indexes.len(), 1); - &index.indexes[0] + let col_idx = if let ColumnIndexMetaData::INT32(index) = &column_index[0][0] { + assert_eq!(index.num_pages(), 1); + index } else { unreachable!() }; - check_def_hist(col_idx.definition_level_histogram().unwrap().values()); - check_rep_hist(col_idx.repetition_level_histogram().unwrap().values()); + check_def_hist(col_idx.definition_level_histogram(0).unwrap()); + 
check_rep_hist(col_idx.repetition_level_histogram(0).unwrap()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); diff --git a/parquet/tests/encryption/encryption_util.rs b/parquet/tests/encryption/encryption_util.rs index bf7fd08109f6..549bdec47343 100644 --- a/parquet/tests/encryption/encryption_util.rs +++ b/parquet/tests/encryption/encryption_util.rs @@ -191,11 +191,11 @@ pub(crate) fn verify_column_indexes(metadata: &ParquetMetaData) { let column_index = &column_index[0][float_col_idx]; match column_index { - parquet::file::page_index::index::Index::FLOAT(float_index) => { - assert_eq!(float_index.indexes.len(), 1); - assert_eq!(float_index.indexes[0].min, Some(0.0f32)); - assert!(float_index.indexes[0] - .max + parquet::file::page_index::index_reader::ColumnIndexMetaData::FLOAT(float_index) => { + assert_eq!(float_index.num_pages(), 1); + assert_eq!(float_index.min_value(0), Some(&0.0f32)); + assert!(float_index + .max_value(0) .is_some_and(|max| (max - 53.9).abs() < 1e-6)); } _ => { From 5ee1b8f8e1fac74f462eda2d7481833fc2c976d3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 14:57:56 -0700 Subject: [PATCH 008/126] fix bug found in testing --- parquet/src/file/page_index/index_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 1680f9ddc0ea..3dc5a8d2dc18 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -312,7 +312,7 @@ impl ByteArrayColumnIndex { min_pos += min.len(); let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + min.len()]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; dst.copy_from_slice(max); max_offsets[i] = max_pos; max_pos += max.len(); From d99a06acf077906dc1f9611757d35780b7a15b38 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 17:03:26 -0700 Subject: [PATCH 009/126] stats converter works --- parquet/src/arrow/arrow_reader/statistics.rs | 233 +++++++++---------- parquet/src/file/page_index/index_reader.rs | 84 ++++++- 2 files changed, 188 insertions(+), 129 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index eba1f561203c..b719d81fe0a1 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::index::{Index, PageIndex}; +use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ @@ -597,17 +597,17 @@ macro_rules! get_statistics { } macro_rules! make_data_page_stats_iterator { - ($iterator_type: ident, $func: expr, $index_type: path, $stat_value_type: ty) => { + ($iterator_type: ident, $func: ident, $index_type: path, $stat_value_type: ty, $conv:expr) => { struct $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { iter: I, } impl<'a, I> $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { fn new(iter: I) -> Self { Self { iter } @@ -616,7 +616,7 @@ macro_rules! 
make_data_page_stats_iterator { impl<'a, I> Iterator for $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { type Item = Vec>; @@ -624,9 +624,12 @@ macro_rules! make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { - $index_type(native_index) => { - Some(native_index.indexes.iter().map($func).collect::>()) - } + $index_type(native_index) => Some( + native_index + .$func() + .map(|v| v.map($conv)) + .collect::>(), + ), // No matching `Index` found; // thus no statistics that can be extracted. // We return vec![None; len] to effectively @@ -648,114 +651,130 @@ macro_rules! make_data_page_stats_iterator { make_data_page_stats_iterator!( MinBooleanDataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::BOOLEAN, - bool + min_values_iter, + ColumnIndexMetaData::BOOLEAN, + bool, + |m| m.clone() ); make_data_page_stats_iterator!( MaxBooleanDataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::BOOLEAN, - bool + max_values_iter, + ColumnIndexMetaData::BOOLEAN, + bool, + |m| m.clone() ); make_data_page_stats_iterator!( MinInt32DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::INT32, - i32 + min_values_iter, + ColumnIndexMetaData::INT32, + i32, + |m| m.clone() ); make_data_page_stats_iterator!( MaxInt32DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::INT32, - i32 + max_values_iter, + ColumnIndexMetaData::INT32, + i32, + |m| m.clone() ); make_data_page_stats_iterator!( MinInt64DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::INT64, - i64 + min_values_iter, + ColumnIndexMetaData::INT64, + i64, + |m| m.clone() ); make_data_page_stats_iterator!( MaxInt64DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::INT64, - i64 + max_values_iter, + ColumnIndexMetaData::INT64, + i64, + |m| m.clone() ); make_data_page_stats_iterator!( MinFloat16DataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + min_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxFloat16DataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + max_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MinFloat32DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::FLOAT, - f32 + min_values_iter, + ColumnIndexMetaData::FLOAT, + f32, + |m| m.clone() ); make_data_page_stats_iterator!( MaxFloat32DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::FLOAT, - f32 + max_values_iter, + ColumnIndexMetaData::FLOAT, + f32, + |m| m.clone() ); make_data_page_stats_iterator!( MinFloat64DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::DOUBLE, - f64 + min_values_iter, + ColumnIndexMetaData::DOUBLE, + f64, + |m| m.clone() ); make_data_page_stats_iterator!( MaxFloat64DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::DOUBLE, - f64 + max_values_iter, + ColumnIndexMetaData::DOUBLE, + f64, + |m| m.clone() ); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::BYTE_ARRAY, - ByteArray + min_values_iter, + ColumnIndexMetaData::BYTE_ARRAY, + ByteArray, + |m| ByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::BYTE_ARRAY, - ByteArray + 
max_values_iter, + ColumnIndexMetaData::BYTE_ARRAY, + ByteArray, + |m| ByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxFixedLenByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + max_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MinFixedLenByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + min_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); macro_rules! get_decimal_page_stats_iterator { ($iterator_type: ident, $func: ident, $stat_value_type: ident, $convert_func: ident) => { struct $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { iter: I, } impl<'a, I> $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { fn new(iter: I) -> Self { Self { iter } @@ -764,44 +783,37 @@ macro_rules! get_decimal_page_stats_iterator { impl<'a, I> Iterator for $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { type Item = Vec>; + // Some(native_index.$func().map(|v| v.map($conv)).collect::>()) fn next(&mut self) -> Option { let next = self.iter.next(); match next { Some((len, index)) => match index { - Index::INT32(native_index) => Some( + ColumnIndexMetaData::INT32(native_index) => Some( native_index - .indexes - .iter() - .map(|x| x.$func.and_then(|x| Some($stat_value_type::from(x)))) + .$func() + .map(|x| x.map(|x| $stat_value_type::from(*x))) .collect::>(), ), - Index::INT64(native_index) => Some( + ColumnIndexMetaData::INT64(native_index) => Some( native_index - .indexes - .iter() - .map(|x| x.$func.and_then(|x| $stat_value_type::try_from(x).ok())) + .$func() + .map(|x| x.map(|x| $stat_value_type::try_from(*x).unwrap())) .collect::>(), ), - Index::BYTE_ARRAY(native_index) => Some( + ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some( native_index - .indexes - .iter() - .map(|x| { - x.clone().$func.and_then(|x| Some($convert_func(x.data()))) - }) + .$func() + .map(|x| x.map(|x| $convert_func(x))) .collect::>(), ), - Index::FIXED_LEN_BYTE_ARRAY(native_index) => Some( + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some( native_index - .indexes - .iter() - .map(|x| { - x.clone().$func.and_then(|x| Some($convert_func(x.data()))) - }) + .$func() + .map(|x| x.map(|x| $convert_func(x))) .collect::>(), ), _ => Some(vec![None; len]), @@ -819,56 +831,56 @@ macro_rules! 
get_decimal_page_stats_iterator { get_decimal_page_stats_iterator!( MinDecimal32DataPageStatsIterator, - min, + min_values_iter, i32, from_bytes_to_i32 ); get_decimal_page_stats_iterator!( MaxDecimal32DataPageStatsIterator, - max, + max_values_iter, i32, from_bytes_to_i32 ); get_decimal_page_stats_iterator!( MinDecimal64DataPageStatsIterator, - min, + min_values_iter, i64, from_bytes_to_i64 ); get_decimal_page_stats_iterator!( MaxDecimal64DataPageStatsIterator, - max, + max_values_iter, i64, from_bytes_to_i64 ); get_decimal_page_stats_iterator!( MinDecimal128DataPageStatsIterator, - min, + min_values_iter, i128, from_bytes_to_i128 ); get_decimal_page_stats_iterator!( MaxDecimal128DataPageStatsIterator, - max, + max_values_iter, i128, from_bytes_to_i128 ); get_decimal_page_stats_iterator!( MinDecimal256DataPageStatsIterator, - min, + min_values_iter, i256, from_bytes_to_i256 ); get_decimal_page_stats_iterator!( MaxDecimal256DataPageStatsIterator, - max, + max_values_iter, i256, from_bytes_to_i256 ); @@ -1181,7 +1193,7 @@ pub(crate) fn min_page_statistics<'a, I>( physical_type: Option, ) -> Result where - I: Iterator, + I: Iterator, { get_data_page_statistics!(Min, data_type, iterator, physical_type) } @@ -1194,7 +1206,7 @@ pub(crate) fn max_page_statistics<'a, I>( physical_type: Option, ) -> Result where - I: Iterator, + I: Iterator, { get_data_page_statistics!(Max, data_type, iterator, physical_type) } @@ -1205,46 +1217,13 @@ where /// The returned Array is an [`UInt64Array`] pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result where - I: Iterator, + I: Iterator, { let iter = iterator.flat_map(|(len, index)| match index { - Index::NONE => vec![None; len], - Index::BOOLEAN(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::INT32(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::INT64(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::FLOAT(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::DOUBLE(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::BYTE_ARRAY(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - _ => unimplemented!(), + ColumnIndexMetaData::NONE => vec![None; len], + column_index => column_index.null_counts().map_or(vec![None; len], |v| { + v.iter().map(|i| Some(*i as u64)).collect::>() + }), }); Ok(UInt64Array::from_iter(iter)) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index b030b61c4918..f13ac2aab407 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -165,6 +165,8 @@ impl ColumnIndex { } /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index pub fn null_count(&self, idx: usize) -> Option { self.null_counts.as_ref().map(|nc| nc[idx]) } @@ -220,6 +222,7 @@ impl PrimitiveColumnIndex { let max = index.max_values[i]; max_values.push(T::try_from_le_slice(max)?); } else { + // need placeholders 
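+                // Null pages store no min/max bytes, so push T::default() as a
+                // placeholder to keep these vectors index-aligned with null_pages;
+                // min_value()/max_value() consult is_null_page() before returning.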
min_values.push(Default::default()); max_values.push(Default::default()); } @@ -238,16 +241,48 @@ impl PrimitiveColumnIndex { }) } - /// Returns an array containing the min values for each page + /// Returns an array containing the min values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. pub fn min_values(&self) -> &[T] { &self.min_values } - /// Returns an array containing the max values for each page + /// Returns an array containing the max values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. pub fn max_values(&self) -> &[T] { &self.max_values } + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + self.min_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + self.max_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + /// Returns the min value for the page indexed by `idx` /// /// It is `None` when all values are null @@ -398,6 +433,32 @@ impl ByteArrayColumnIndex { } } + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).into_iter().map(|i| { + if self.is_null_page(i) { + None + } else { + self.min_value(i) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).into_iter().map(|i| { + if self.is_null_page(i) { + None + } else { + self.max_value(i) + } + }) + } + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { let mut min_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { @@ -524,12 +585,31 @@ impl ColumnIndexMetaData { } } + /// Returns array of null counts, one per page. 
+    ///
+    /// Returns `None` if no null counts have been set in the index
+    pub fn null_counts(&self) -> Option<&Vec<i64>> {
+        match self {
+            Self::NONE => None,
+            Self::BOOLEAN(index) => index.null_counts.as_ref(),
+            Self::INT32(index) => index.null_counts.as_ref(),
+            Self::INT64(index) => index.null_counts.as_ref(),
+            Self::INT96(index) => index.null_counts.as_ref(),
+            Self::FLOAT(index) => index.null_counts.as_ref(),
+            Self::DOUBLE(index) => index.null_counts.as_ref(),
+            Self::BYTE_ARRAY(index) => index.null_counts.as_ref(),
+            Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(),
+        }
+    }
+
     /// Returns the number of pages
     pub fn num_pages(&self) -> u64 {
         colidx_enum_func!(self, num_pages)
     }
 
     /// Returns the number of null values in the page indexed by `idx`
+    ///
+    /// Returns `None` if no null counts have been set in the index
     pub fn null_count(&self, idx: usize) -> Option<i64> {
         colidx_enum_func!(self, null_count, idx)
     }

From 79a6917efb7a8123329450de21ecce461729f796 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 21 Aug 2025 17:10:03 -0700
Subject: [PATCH 010/126] get rid of import

---
 parquet/src/file/writer.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 1808e88878e7..d0101aa84a35 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1062,7 +1062,6 @@ mod tests {
     use crate::column::reader::get_typed_column_reader;
     use crate::compression::{create_codec, Codec, CodecOptionsBuilder};
     use crate::data_type::{BoolType, ByteArrayType, Int32Type};
-    use crate::file::page_index::index::Index;
     use crate::file::page_index::index_reader::ColumnIndexMetaData;
     use crate::file::properties::EnabledStatistics;
     use crate::file::serialized_reader::ReadOptionsBuilder;

From 878d4607dc3ac28d3b0e00b2b5647951cb48e329 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 21 Aug 2025 20:05:42 -0700
Subject: [PATCH 011/126] get parquet-index working

---
 parquet/src/arrow/arrow_reader/statistics.rs | 20 ++---
 parquet/src/bin/parquet-index.rs             | 89 ++++++++++++++++----
 parquet/src/file/page_index/index_reader.rs  | 72 ++++++++--------
 3 files changed, 118 insertions(+), 63 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs
index b719d81fe0a1..68dd36d0437a 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -654,42 +654,42 @@ make_data_page_stats_iterator!(
     min_values_iter,
     ColumnIndexMetaData::BOOLEAN,
     bool,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MaxBooleanDataPageStatsIterator,
     max_values_iter,
     ColumnIndexMetaData::BOOLEAN,
     bool,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MinInt32DataPageStatsIterator,
     min_values_iter,
     ColumnIndexMetaData::INT32,
     i32,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MaxInt32DataPageStatsIterator,
     max_values_iter,
     ColumnIndexMetaData::INT32,
     i32,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MinInt64DataPageStatsIterator,
     min_values_iter,
     ColumnIndexMetaData::INT64,
     i64,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MaxInt64DataPageStatsIterator,
     max_values_iter,
     ColumnIndexMetaData::INT64,
     i64,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MinFloat16DataPageStatsIterator,
@@ -710,28 +710,28 @@ make_data_page_stats_iterator!(
     min_values_iter,
     ColumnIndexMetaData::FLOAT,
     f32,
-    |m| m.clone()
+    |m| *m
 );
 make_data_page_stats_iterator!(
     MaxFloat32DataPageStatsIterator,
max_values_iter, ColumnIndexMetaData::FLOAT, f32, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinFloat64DataPageStatsIterator, min_values_iter, ColumnIndexMetaData::DOUBLE, f64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxFloat64DataPageStatsIterator, max_values_iter, ColumnIndexMetaData::DOUBLE, f64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index e91f5e5a9f17..161a1507a146 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -35,8 +35,11 @@ //! [page index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md use clap::Parser; +use parquet::data_type::ByteArray; use parquet::errors::{ParquetError, Result}; -use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::page_index::index_reader::{ + ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::serialized_reader::ReadOptionsBuilder; @@ -96,16 +99,20 @@ impl Args { let row_counts = compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows()); match &column_indices[column_idx] { - Index::NONE => println!("NO INDEX"), - Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT32(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT64(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::FIXED_LEN_BYTE_ARRAY(v) => { - print_index(&v.indexes, offset_index, &row_counts)? + ColumnIndexMetaData::NONE => println!("NO INDEX"), + ColumnIndexMetaData::BOOLEAN(v) => { + print_index::(v, offset_index, &row_counts)? + } + ColumnIndexMetaData::INT32(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::INT64(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::INT96(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::FLOAT(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::DOUBLE(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::BYTE_ARRAY(v) => { + print_bytes_index(v, offset_index, &row_counts)? + } + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => { + print_bytes_index(v, offset_index, &row_counts)? 
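+                    // Fixed- and variable-length byte arrays both expose raw byte
+                    // slices, so they share the byte-array printer defined below.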
                }
             }
         }
 
@@ -131,20 +138,21 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec<i64> {
 
 /// Prints index information for a single column chunk
 fn print_index<T: std::fmt::Display>(
-    column_index: &[PageIndex<T>],
+    column_index: &PrimitiveColumnIndex<T>,
     offset_index: &OffsetIndexMetaData,
     row_counts: &[i64],
 ) -> Result<()> {
-    if column_index.len() != offset_index.page_locations.len() {
+    if column_index.num_pages() as usize != offset_index.page_locations.len() {
         return Err(ParquetError::General(format!(
             "Index length mismatch, got {} and {}",
-            column_index.len(),
+            column_index.num_pages(),
             offset_index.page_locations.len()
         )));
     }
 
-    for (idx, ((c, o), row_count)) in column_index
-        .iter()
+    for (idx, (((min, max), o), row_count)) in column_index
+        .min_values_iter()
+        .zip(column_index.max_values_iter())
         .zip(offset_index.page_locations())
         .zip(row_counts)
         .enumerate()
     {
@@ -153,12 +161,12 @@ fn print_index<T: std::fmt::Display>(
             "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}",
             idx, o.offset, o.compressed_page_size, row_count
         );
-        match &c.min {
+        match min {
             Some(m) => print!(", min {m:>10}"),
             None => print!(", min {:>10}", "NONE"),
         }
 
-        match &c.max {
+        match max {
             Some(m) => print!(", max {m:>10}"),
             None => print!(", max {:>10}", "NONE"),
         }
@@ -168,6 +176,51 @@ fn print_index<T: std::fmt::Display>(
     Ok(())
 }
 
+fn print_bytes_index(
+    column_index: &ByteArrayColumnIndex,
+    offset_index: &OffsetIndexMetaData,
+    row_counts: &[i64],
+) -> Result<()> {
+    if column_index.num_pages() as usize != offset_index.page_locations.len() {
+        return Err(ParquetError::General(format!(
+            "Index length mismatch, got {} and {}",
+            column_index.num_pages(),
+            offset_index.page_locations.len()
+        )));
+    }
+
+    for (idx, (((min, max), o), row_count)) in column_index
+        .min_values_iter()
+        .zip(column_index.max_values_iter())
+        .zip(offset_index.page_locations())
+        .zip(row_counts)
+        .enumerate()
+    {
+        print!(
+            "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}",
+            idx, o.offset, o.compressed_page_size, row_count
+        );
+        match min {
+            Some(m) => match String::from_utf8(m.to_vec()) {
+                Ok(s) => print!(", min {s:>10}"),
+                Err(_) => print!(", min {:>10}", ByteArray::from(m)),
+            },
+            None => print!(", min {:>10}", "NONE"),
+        }
+
+        match max {
+            Some(m) => match String::from_utf8(m.to_vec()) {
+                Ok(s) => print!(", max {s:>10}"),
+                Err(_) => print!(", max {:>10}", ByteArray::from(m)),
+            },
+            None => print!(", max {:>10}", "NONE"),
+        }
+        println!()
+    }
+
+    Ok(())
+}
+
 fn main() -> Result<()> {
     Args::parse().run()
 }
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index f13ac2aab407..d37ee789f728 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -201,7 +201,7 @@ impl ColumnIndex {
 
 /// Column index for primitive types
 #[derive(Debug, Clone, PartialEq)]
-pub struct PrimitiveColumnIndex<T: ParquetValueType> {
+pub struct PrimitiveColumnIndex<T> {
     pub(crate) column_index: ColumnIndex,
     pub(crate) min_values: Vec<T>,
     pub(crate) max_values: Vec<T>,
@@ -241,16 +241,48 @@ impl<T: ParquetValueType> PrimitiveColumnIndex<T> {
         })
     }
 
+    pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex {
+        let min_values = self
+            .min_values
+            .iter()
+            .map(|x| x.as_bytes().to_vec())
+            .collect::<Vec<_>>();
+
+        let max_values = self
+            .max_values
+            .iter()
+            .map(|x| x.as_bytes().to_vec())
+            .collect::<Vec<_>>();
+
+        let null_counts = self.null_counts.clone();
+        let repetition_level_histograms = self.repetition_level_histograms.clone();
+        let definition_level_histograms = 
self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl PrimitiveColumnIndex { /// Returns an array containing the min values for each page. /// /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] @@ -304,38 +335,9 @@ impl PrimitiveColumnIndex { Some(&self.max_values[idx]) } } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .min_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let max_values = self - .max_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } } -impl Deref for PrimitiveColumnIndex { +impl Deref for PrimitiveColumnIndex { type Target = ColumnIndex; fn deref(&self) -> &Self::Target { @@ -437,7 +439,7 @@ impl ByteArrayColumnIndex { /// /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. pub fn min_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).into_iter().map(|i| { + (0..self.num_pages() as usize).map(|i| { if self.is_null_page(i) { None } else { @@ -450,7 +452,7 @@ impl ByteArrayColumnIndex { /// /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
pub fn max_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).into_iter().map(|i| { + (0..self.num_pages() as usize).map(|i| { if self.is_null_page(i) { None } else { @@ -462,12 +464,12 @@ impl ByteArrayColumnIndex { pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { let mut min_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { - min_values.push(self.min_value(i).unwrap_or(&vec![]).to_owned()); + min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); } let mut max_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { - max_values.push(self.max_value(i).unwrap_or(&vec![]).to_owned()); + max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); } let null_counts = self.null_counts.clone(); From 009632a91d6b04e519593ceb154e217d955a9c05 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 20:20:28 -0700 Subject: [PATCH 012/126] doc fixes --- parquet/src/arrow/arrow_reader/statistics.rs | 8 ++++---- parquet/src/file/page_index/index_reader.rs | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 68dd36d0437a..c8d0c6581288 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1186,7 +1186,7 @@ fn max_statistics<'a, I: Iterator>>( } /// Extracts the min statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] pub(crate) fn min_page_statistics<'a, I>( data_type: &DataType, iterator: I, @@ -1199,7 +1199,7 @@ where } /// Extracts the max statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] pub(crate) fn max_page_statistics<'a, I>( data_type: &DataType, iterator: I, @@ -1212,7 +1212,7 @@ where } /// Extracts the null count statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] /// /// The returned Array is an [`UInt64Array`] pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result @@ -1552,7 +1552,7 @@ impl<'a> StatisticsConverter<'a> { /// page level statistics can prune at a finer granularity. /// /// However since they are stored in a separate metadata - /// structure ([`Index`]) there is different code to extract them as + /// structure ([`ColumnIndexMetaData`]) there is different code to extract them as /// compared to arrow statistics. /// /// # Parameters: diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index d37ee789f728..d9358486ed84 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Support for reading [`Index`] and [`OffsetIndexMetaData`] from parquet metadata. +//! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`] from parquet metadata. 
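+//!
+//! A minimal sketch of how these structures might be obtained (hypothetical
+//! `data.parquet` path; assumes the file was written with page indexes enabled):
+//!
+//! ```no_run
+//! use parquet::file::page_index::index_reader::ColumnIndexMetaData;
+//! use parquet::file::reader::{FileReader, SerializedFileReader};
+//! use parquet::file::serialized_reader::ReadOptionsBuilder;
+//!
+//! let file = std::fs::File::open("data.parquet").unwrap();
+//! // request that the page indexes be read and decoded along with the footer
+//! let options = ReadOptionsBuilder::new().with_page_index().build();
+//! let reader = SerializedFileReader::new_with_options(file, options).unwrap();
+//! // indexes are addressed by [row group][column]
+//! let column_index = reader.metadata().column_index().unwrap();
+//! if let ColumnIndexMetaData::INT32(index) = &column_index[0][0] {
+//!     println!("first page min: {:?}", index.min_value(0));
+//! }
+//! ```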
use crate::basic::{BoundaryOrder, Type}; use crate::data_type::private::ParquetValueType; @@ -38,7 +38,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Option< } } -/// Reads per-column [`Index`] for all columns of a row group by +/// Reads per-column [`ColumnIndexMetaData`] for all columns of a row group by /// decoding [`ColumnIndex`] . /// /// Returns a vector of `index[column_number]`. From a822dfd1f97b1e7f3722a36e8cc98b989e029994 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 08:35:47 -0700 Subject: [PATCH 013/126] move column index to its own module --- parquet/src/arrow/arrow_reader/statistics.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/bin/parquet-index.rs | 2 +- parquet/src/file/metadata/memory.rs | 4 +- parquet/src/file/metadata/mod.rs | 4 +- parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/metadata/writer.rs | 2 +- parquet/src/file/page_index/column_index.rs | 514 +++++++++++++++++++ parquet/src/file/page_index/index.rs | 2 +- parquet/src/file/page_index/index_reader.rs | 495 +----------------- parquet/src/file/page_index/mod.rs | 1 + parquet/src/file/serialized_reader.rs | 2 +- parquet/src/file/writer.rs | 2 +- parquet/tests/encryption/encryption_util.rs | 2 +- 14 files changed, 534 insertions(+), 502 deletions(-) create mode 100644 parquet/src/file/page_index/column_index.rs diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index c8d0c6581288..d98732f5d075 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::index_reader::ColumnIndexMetaData; +use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1041a1af1f77..bd9f30c36103 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1488,7 +1488,7 @@ mod tests { use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; use crate::file::page_encoding_stats::PageEncodingStats; - use crate::file::page_index::index_reader::ColumnIndexMetaData; + use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::format::PageHeader; use crate::schema::types::ColumnPath; diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 161a1507a146..397a75c76ae4 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -37,7 +37,7 @@ use clap::Parser; use parquet::data_type::ByteArray; use parquet::errors::{ParquetError, Result}; -use parquet::file::page_index::index_reader::{ +use parquet::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 5c1477e2cb14..69eee3c2999d 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -24,10 +24,10 @@ 
use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData, SortingColumn, }; use crate::file::page_encoding_stats::PageEncodingStats; -use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; -use crate::file::page_index::index_reader::{ +use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; +use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; use std::sync::Arc; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a619d76658e9..69cdf8f10714 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -106,7 +106,7 @@ use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, - page_index::{index_reader::ColumnIndexMetaData, offset_index::PageLocation}, + page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}, }; use crate::file::{ page_index::index::PageIndex, @@ -1948,7 +1948,7 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::page_index::index_reader::{ColumnIndex, PrimitiveColumnIndex}; + use crate::file::page_index::column_index::{ColumnIndex, PrimitiveColumnIndex}; #[test] fn test_row_group_metadata_thrift_conversion() { diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 97ea72ef964c..57cc7c57ac66 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -34,7 +34,7 @@ use bytes::Bytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; -use crate::file::page_index::index_reader::ColumnIndexMetaData; +use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 8c485f7d0e8b..404bcf5dba8a 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -32,7 +32,7 @@ use crate::format::{AesGcmV1, ColumnCryptoMetaData}; use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; -use crate::{errors::Result, file::page_index::index_reader::ColumnIndexMetaData}; +use crate::{errors::Result, file::page_index::column_index::ColumnIndexMetaData}; use std::io::Write; use std::sync::Arc; use thrift::protocol::TCompactOutputProtocol; diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs new file mode 100644 index 000000000000..3fb6003e7c66 --- /dev/null +++ b/parquet/src/file/page_index/column_index.rs @@ -0,0 +1,514 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ColumnIndexMetaData`] structures holding decoded [`ColumnIndex`] information +//! +//! [`ColumnIndex`]: crate::format::ColumnIndex +//! + +use crate::errors::Result; +use std::ops::Deref; + +use crate::{ + basic::BoundaryOrder, + data_type::{private::ParquetValueType, Int96}, + file::page_index::index_reader::ThriftColumnIndex, +}; + +/// Common bits of the column index +#[derive(Debug, Clone, PartialEq)] +pub struct ColumnIndex { + pub(crate) null_pages: Vec, + pub(crate) boundary_order: BoundaryOrder, + pub(crate) null_counts: Option>, + pub(crate) repetition_level_histograms: Option>, + pub(crate) definition_level_histograms: Option>, +} + +impl ColumnIndex { + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + self.null_pages.len() as u64 + } + + /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index + pub fn null_count(&self, idx: usize) -> Option { + self.null_counts.as_ref().map(|nc| nc[idx]) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { + let num_lvls = rep_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&rep_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(def_hists) = self.definition_level_histograms.as_ref() { + let num_lvls = def_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&def_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + self.null_pages[idx] + } +} + +/// Column index for primitive types +#[derive(Debug, Clone, PartialEq)] +pub struct PrimitiveColumnIndex { + pub(crate) column_index: ColumnIndex, + pub(crate) min_values: Vec, + pub(crate) max_values: Vec, +} + +impl PrimitiveColumnIndex { + pub(super) fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let mut min_values = Vec::with_capacity(len); + let mut max_values = Vec::with_capacity(len); + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + min_values.push(T::try_from_le_slice(min)?); + + let max = index.max_values[i]; + max_values.push(T::try_from_le_slice(max)?); + } else { + // need placeholders + min_values.push(Default::default()); + max_values.push(Default::default()); + } + } + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: 
index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_values, + max_values, + }) + } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let min_values = self + .min_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let max_values = self + .max_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl PrimitiveColumnIndex { + /// Returns an array containing the min values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. + pub fn min_values(&self) -> &[T] { + &self.min_values + } + + /// Returns an array containing the max values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. + pub fn max_values(&self) -> &[T] { + &self.max_values + } + + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + self.min_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
+ pub fn max_values_iter(&self) -> impl Iterator> { + self.max_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn min_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.min_values[idx]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.max_values[idx]) + } + } +} + +impl Deref for PrimitiveColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +/// Column index for byte arrays (fixed length and variable) +#[derive(Debug, Clone, PartialEq)] +pub struct ByteArrayColumnIndex { + pub(crate) column_index: ColumnIndex, + // raw bytes for min and max values + pub(crate) min_bytes: Vec, + pub(crate) min_offsets: Vec, + pub(crate) max_bytes: Vec, + pub(crate) max_offsets: Vec, +} + +impl ByteArrayColumnIndex { + pub(super) fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let min_len = index.min_values.iter().map(|&v| v.len()).sum(); + let max_len = index.max_values.iter().map(|&v| v.len()).sum(); + let mut min_bytes = vec![0u8; min_len]; + let mut max_bytes = vec![0u8; max_len]; + + let mut min_offsets = vec![0usize; len + 1]; + let mut max_offsets = vec![0usize; len + 1]; + + let mut min_pos = 0; + let mut max_pos = 0; + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + let dst = &mut min_bytes[min_pos..min_pos + min.len()]; + dst.copy_from_slice(min); + min_offsets[i] = min_pos; + min_pos += min.len(); + + let max = index.max_values[i]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; + dst.copy_from_slice(max); + max_offsets[i] = max_pos; + max_pos += max.len(); + } else { + min_offsets[i] = min_pos; + max_offsets[i] = max_pos; + } + } + + min_offsets[len] = min_pos; + max_offsets[len] = max_pos; + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + + min_bytes, + min_offsets, + max_bytes, + max_offsets, + }) + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn min_value(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.min_offsets[idx]; + let end = self.min_offsets[idx + 1]; + Some(&self.min_bytes[start..end]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.max_offsets[idx]; + let end = self.max_offsets[idx + 1]; + Some(&self.max_bytes[start..end]) + } + } + + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
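+    ///
+    /// A usage sketch (hypothetical `index: &ByteArrayColumnIndex`; the yielded
+    /// slices are the raw bytes stored in the index):
+    ///
+    /// ```ignore
+    /// // collect the non-null page minimums as owned byte vectors
+    /// let mins: Vec<Vec<u8>> = index
+    ///     .min_values_iter()
+    ///     .flatten()
+    ///     .map(|v| v.to_vec())
+    ///     .collect();
+    /// ```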
+ pub fn min_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).map(|i| { + if self.is_null_page(i) { + None + } else { + self.min_value(i) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).map(|i| { + if self.is_null_page(i) { + None + } else { + self.max_value(i) + } + }) + } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let mut min_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); + } + + let mut max_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); + } + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl Deref for ByteArrayColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +// Macro to generate getter functions for ColumnIndexMetaData. +macro_rules! colidx_enum_func { + ($self:ident, $func:ident, $arg:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func($arg), + Self::INT32(ref typed) => typed.$func($arg), + Self::INT64(ref typed) => typed.$func($arg), + Self::INT96(ref typed) => typed.$func($arg), + Self::FLOAT(ref typed) => typed.$func($arg), + Self::DOUBLE(ref typed) => typed.$func($arg), + Self::BYTE_ARRAY(ref typed) => typed.$func($arg), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; + ($self:ident, $func:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func(), + Self::INT32(ref typed) => typed.$func(), + Self::INT64(ref typed) => typed.$func(), + Self::INT96(ref typed) => typed.$func(), + Self::FLOAT(ref typed) => typed.$func(), + Self::DOUBLE(ref typed) => typed.$func(), + Self::BYTE_ARRAY(ref typed) => typed.$func(), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; +} + +/// index +#[derive(Debug, Clone, PartialEq)] +#[allow(non_camel_case_types)] +pub enum ColumnIndexMetaData { + /// Sometimes reading page index from parquet file + /// will only return pageLocations without min_max index, + /// `NONE` represents this lack of index information + NONE, + /// Boolean type index + BOOLEAN(PrimitiveColumnIndex), + /// 32-bit integer type index + INT32(PrimitiveColumnIndex), + /// 64-bit integer type index + INT64(PrimitiveColumnIndex), + /// 96-bit integer type (timestamp) index + INT96(PrimitiveColumnIndex), + /// 32-bit floating point type index + FLOAT(PrimitiveColumnIndex), + /// 64-bit floating point type index + DOUBLE(PrimitiveColumnIndex), + /// Byte array type index + BYTE_ARRAY(ByteArrayColumnIndex), + /// Fixed length byte array type index + FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex), +} + +impl 
ColumnIndexMetaData {
+    /// Return min/max elements inside ColumnIndex are ordered or not.
+    pub fn is_sorted(&self) -> bool {
+        // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING,
+        if let Some(order) = self.get_boundary_order() {
+            order != BoundaryOrder::UNORDERED
+        } else {
+            false
+        }
+    }
+
+    /// Get boundary_order of this page index.
+    pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
+        match self {
+            Self::NONE => None,
+            Self::BOOLEAN(index) => Some(index.boundary_order),
+            Self::INT32(index) => Some(index.boundary_order),
+            Self::INT64(index) => Some(index.boundary_order),
+            Self::INT96(index) => Some(index.boundary_order),
+            Self::FLOAT(index) => Some(index.boundary_order),
+            Self::DOUBLE(index) => Some(index.boundary_order),
+            Self::BYTE_ARRAY(index) => Some(index.boundary_order),
+            Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
+        }
+    }
+
+    /// Returns array of null counts, one per page.
+    ///
+    /// Returns `None` if no null counts have been set in the index
+    pub fn null_counts(&self) -> Option<&Vec<i64>> {
+        match self {
+            Self::NONE => None,
+            Self::BOOLEAN(index) => index.null_counts.as_ref(),
+            Self::INT32(index) => index.null_counts.as_ref(),
+            Self::INT64(index) => index.null_counts.as_ref(),
+            Self::INT96(index) => index.null_counts.as_ref(),
+            Self::FLOAT(index) => index.null_counts.as_ref(),
+            Self::DOUBLE(index) => index.null_counts.as_ref(),
+            Self::BYTE_ARRAY(index) => index.null_counts.as_ref(),
+            Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(),
+        }
+    }
+
+    /// Returns the number of pages
+    pub fn num_pages(&self) -> u64 {
+        colidx_enum_func!(self, num_pages)
+    }
+
+    /// Returns the number of null values in the page indexed by `idx`
+    ///
+    /// Returns `None` if no null counts have been set in the index
+    pub fn null_count(&self, idx: usize) -> Option<i64> {
+        colidx_enum_func!(self, null_count, idx)
+    }
+
+    /// Returns the repetition level histogram for the page indexed by `idx`
+    pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        colidx_enum_func!(self, repetition_level_histogram, idx)
+    }
+
+    /// Returns the definition level histogram for the page indexed by `idx`
+    pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        colidx_enum_func!(self, definition_level_histogram, idx)
+    }
+
+    /// Returns whether the page indexed by `idx` consists of all null values
+    pub fn is_null_page(&self, idx: usize) -> bool {
+        colidx_enum_func!(self, is_null_page, idx)
+    }
+}
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 22d6e92666db..861dc0c3b04e 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -311,7 +311,7 @@ impl<T: ParquetValueType> NativeIndex<T> {
 
     /// Creates a new [`NativeIndex`]
     #[allow(dead_code)]
-    pub(crate) fn try_new_local(index: ThriftColumnIndex) -> Result<Self> {
+    pub(super) fn try_new_local(index: ThriftColumnIndex) -> Result<Self> {
         let len = index.min_values.len();
 
         // turn Option<Vec<i64>> into Vec<Option<i64>>
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index d9358486ed84..f35241689e1c 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -18,15 +18,17 @@
 //! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`] from parquet metadata.
use crate::basic::{BoundaryOrder, Type}; -use crate::data_type::private::ParquetValueType; use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; +use crate::file::page_index::column_index::{ + ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; -use std::ops::{Deref, Range}; +use std::ops::Range; /// Computes the covering range of two optional ranges /// @@ -134,8 +136,9 @@ pub(crate) fn decode_offset_index(data: &[u8]) -> Result { +pub(super) struct ThriftColumnIndex<'a> { 1: required list null_pages 2: required list<'a> min_values 3: required list<'a> max_values @@ -146,492 +149,6 @@ pub(crate) struct ThriftColumnIndex<'a> { } ); -// TODO: the following should move to its own module - -/// Common bits of the column index -#[derive(Debug, Clone, PartialEq)] -pub struct ColumnIndex { - pub(crate) null_pages: Vec, - pub(crate) boundary_order: BoundaryOrder, - pub(crate) null_counts: Option>, - pub(crate) repetition_level_histograms: Option>, - pub(crate) definition_level_histograms: Option>, -} - -impl ColumnIndex { - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - self.null_pages.len() as u64 - } - - /// Returns the number of null values in the page indexed by `idx` - /// - /// Returns `None` if no null counts have been set in the index - pub fn null_count(&self, idx: usize) -> Option { - self.null_counts.as_ref().map(|nc| nc[idx]) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { - let num_lvls = rep_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&rep_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(def_hists) = self.definition_level_histograms.as_ref() { - let num_lvls = def_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&def_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns whether the page indexed by `idx` consists of all null values - pub fn is_null_page(&self, idx: usize) -> bool { - self.null_pages[idx] - } -} - -/// Column index for primitive types -#[derive(Debug, Clone, PartialEq)] -pub struct PrimitiveColumnIndex { - pub(crate) column_index: ColumnIndex, - pub(crate) min_values: Vec, - pub(crate) max_values: Vec, -} - -impl PrimitiveColumnIndex { - fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); - - let mut min_values = Vec::with_capacity(len); - let mut max_values = Vec::with_capacity(len); - - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { - if !is_null { - let min = index.min_values[i]; - min_values.push(T::try_from_le_slice(min)?); - - let max = index.max_values[i]; - max_values.push(T::try_from_le_slice(max)?); - } else { - // need placeholders - min_values.push(Default::default()); - max_values.push(Default::default()); - } - } - - Ok(Self { - column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - 
repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, - }, - min_values, - max_values, - }) - } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .min_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let max_values = self - .max_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } -} - -impl PrimitiveColumnIndex { - /// Returns an array containing the min values for each page. - /// - /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] - /// is `false` for the same index. - pub fn min_values(&self) -> &[T] { - &self.min_values - } - - /// Returns an array containing the max values for each page. - /// - /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] - /// is `false` for the same index. - pub fn max_values(&self) -> &[T] { - &self.max_values - } - - /// Returns an iterator over the min values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. - pub fn min_values_iter(&self) -> impl Iterator> { - self.min_values.iter().enumerate().map(|(i, min)| { - if self.is_null_page(i) { - None - } else { - Some(min) - } - }) - } - - /// Returns an iterator over the max values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
- pub fn max_values_iter(&self) -> impl Iterator> { - self.max_values.iter().enumerate().map(|(i, min)| { - if self.is_null_page(i) { - None - } else { - Some(min) - } - }) - } - - /// Returns the min value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn min_value(&self, idx: usize) -> Option<&T> { - if self.null_pages[idx] { - None - } else { - Some(&self.min_values[idx]) - } - } - - /// Returns the max value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn max_value(&self, idx: usize) -> Option<&T> { - if self.null_pages[idx] { - None - } else { - Some(&self.max_values[idx]) - } - } -} - -impl Deref for PrimitiveColumnIndex { - type Target = ColumnIndex; - - fn deref(&self) -> &Self::Target { - &self.column_index - } -} - -/// Column index for byte arrays (fixed length and variable) -#[derive(Debug, Clone, PartialEq)] -pub struct ByteArrayColumnIndex { - pub(crate) column_index: ColumnIndex, - // raw bytes for min and max values - pub(crate) min_bytes: Vec, - pub(crate) min_offsets: Vec, - pub(crate) max_bytes: Vec, - pub(crate) max_offsets: Vec, -} - -impl ByteArrayColumnIndex { - fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); - - let min_len = index.min_values.iter().map(|&v| v.len()).sum(); - let max_len = index.max_values.iter().map(|&v| v.len()).sum(); - let mut min_bytes = vec![0u8; min_len]; - let mut max_bytes = vec![0u8; max_len]; - - let mut min_offsets = vec![0usize; len + 1]; - let mut max_offsets = vec![0usize; len + 1]; - - let mut min_pos = 0; - let mut max_pos = 0; - - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { - if !is_null { - let min = index.min_values[i]; - let dst = &mut min_bytes[min_pos..min_pos + min.len()]; - dst.copy_from_slice(min); - min_offsets[i] = min_pos; - min_pos += min.len(); - - let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + max.len()]; - dst.copy_from_slice(max); - max_offsets[i] = max_pos; - max_pos += max.len(); - } else { - min_offsets[i] = min_pos; - max_offsets[i] = max_pos; - } - } - - min_offsets[len] = min_pos; - max_offsets[len] = max_pos; - - Ok(Self { - column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, - }, - - min_bytes, - min_offsets, - max_bytes, - max_offsets, - }) - } - - /// Returns the min value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn min_value(&self, idx: usize) -> Option<&[u8]> { - if self.null_pages[idx] { - None - } else { - let start = self.min_offsets[idx]; - let end = self.min_offsets[idx + 1]; - Some(&self.min_bytes[start..end]) - } - } - - /// Returns the max value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn max_value(&self, idx: usize) -> Option<&[u8]> { - if self.null_pages[idx] { - None - } else { - let start = self.max_offsets[idx]; - let end = self.max_offsets[idx + 1]; - Some(&self.max_bytes[start..end]) - } - } - - /// Returns an iterator over the min values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
- pub fn min_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).map(|i| { - if self.is_null_page(i) { - None - } else { - self.min_value(i) - } - }) - } - - /// Returns an iterator over the max values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. - pub fn max_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).map(|i| { - if self.is_null_page(i) { - None - } else { - self.max_value(i) - } - }) - } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let mut min_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); - } - - let mut max_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); - } - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } -} - -impl Deref for ByteArrayColumnIndex { - type Target = ColumnIndex; - - fn deref(&self) -> &Self::Target { - &self.column_index - } -} - -// Macro to generate getter functions for ColumnIndexMetaData. -macro_rules! colidx_enum_func { - ($self:ident, $func:ident, $arg:ident) => {{ - match *$self { - Self::BOOLEAN(ref typed) => typed.$func($arg), - Self::INT32(ref typed) => typed.$func($arg), - Self::INT64(ref typed) => typed.$func($arg), - Self::INT96(ref typed) => typed.$func($arg), - Self::FLOAT(ref typed) => typed.$func($arg), - Self::DOUBLE(ref typed) => typed.$func($arg), - Self::BYTE_ARRAY(ref typed) => typed.$func($arg), - Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg), - _ => panic!(concat!( - "Cannot call ", - stringify!($func), - " on ColumnIndexMetaData::NONE" - )), - } - }}; - ($self:ident, $func:ident) => {{ - match *$self { - Self::BOOLEAN(ref typed) => typed.$func(), - Self::INT32(ref typed) => typed.$func(), - Self::INT64(ref typed) => typed.$func(), - Self::INT96(ref typed) => typed.$func(), - Self::FLOAT(ref typed) => typed.$func(), - Self::DOUBLE(ref typed) => typed.$func(), - Self::BYTE_ARRAY(ref typed) => typed.$func(), - Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(), - _ => panic!(concat!( - "Cannot call ", - stringify!($func), - " on ColumnIndexMetaData::NONE" - )), - } - }}; -} - -/// index -#[derive(Debug, Clone, PartialEq)] -#[allow(non_camel_case_types)] -pub enum ColumnIndexMetaData { - /// Sometimes reading page index from parquet file - /// will only return pageLocations without min_max index, - /// `NONE` represents this lack of index information - NONE, - /// Boolean type index - BOOLEAN(PrimitiveColumnIndex), - /// 32-bit integer type index - INT32(PrimitiveColumnIndex), - /// 64-bit integer type index - INT64(PrimitiveColumnIndex), - /// 96-bit integer type (timestamp) index - INT96(PrimitiveColumnIndex), - /// 32-bit floating point type index - FLOAT(PrimitiveColumnIndex), - /// 64-bit floating point type index - DOUBLE(PrimitiveColumnIndex), - /// Byte array type index - BYTE_ARRAY(ByteArrayColumnIndex), - /// Fixed length byte array type index - FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex), -} - -impl 
ColumnIndexMetaData { - /// Return min/max elements inside ColumnIndex are ordered or not. - pub fn is_sorted(&self) -> bool { - // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, - if let Some(order) = self.get_boundary_order() { - order != BoundaryOrder::UNORDERED - } else { - false - } - } - - /// Get boundary_order of this page index. - pub fn get_boundary_order(&self) -> Option { - match self { - Self::NONE => None, - Self::BOOLEAN(index) => Some(index.boundary_order), - Self::INT32(index) => Some(index.boundary_order), - Self::INT64(index) => Some(index.boundary_order), - Self::INT96(index) => Some(index.boundary_order), - Self::FLOAT(index) => Some(index.boundary_order), - Self::DOUBLE(index) => Some(index.boundary_order), - Self::BYTE_ARRAY(index) => Some(index.boundary_order), - Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), - } - } - - /// Returns array of null counts, one per page. - /// - /// Returns `None` if now null counts have been set in the index - pub fn null_counts(&self) -> Option<&Vec> { - match self { - Self::NONE => None, - Self::BOOLEAN(index) => index.null_counts.as_ref(), - Self::INT32(index) => index.null_counts.as_ref(), - Self::INT64(index) => index.null_counts.as_ref(), - Self::INT96(index) => index.null_counts.as_ref(), - Self::FLOAT(index) => index.null_counts.as_ref(), - Self::DOUBLE(index) => index.null_counts.as_ref(), - Self::BYTE_ARRAY(index) => index.null_counts.as_ref(), - Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(), - } - } - - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - colidx_enum_func!(self, num_pages) - } - - /// Returns the number of null values in the page indexed by `idx` - /// - /// Returns `None` if no null counts have been set in the index - pub fn null_count(&self, idx: usize) -> Option { - colidx_enum_func!(self, null_count, idx) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - colidx_enum_func!(self, repetition_level_histogram, idx) - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - colidx_enum_func!(self, definition_level_histogram, idx) - } - - /// Returns whether the page indexed by `idx` consists of all null values - pub fn is_null_page(&self, idx: usize) -> bool { - colidx_enum_func!(self, is_null_page, idx) - } -} - pub(crate) fn decode_column_index( data: &[u8], column_type: Type, diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index a8077896db34..ff70e2eca5dd 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -19,6 +19,7 @@ //! //! 
[Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub mod column_index; pub mod index; pub mod index_reader; pub mod offset_index; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 101599d3246e..5308825b0976 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1102,7 +1102,7 @@ mod tests { use bytes::Buf; - use crate::file::page_index::index_reader::{ + use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; use crate::file::properties::{EnabledStatistics, WriterProperties}; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index d0101aa84a35..65b96246ea03 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1062,7 +1062,7 @@ mod tests { use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; - use crate::file::page_index::index_reader::ColumnIndexMetaData; + use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ diff --git a/parquet/tests/encryption/encryption_util.rs b/parquet/tests/encryption/encryption_util.rs index 549bdec47343..6817491b3024 100644 --- a/parquet/tests/encryption/encryption_util.rs +++ b/parquet/tests/encryption/encryption_util.rs @@ -191,7 +191,7 @@ pub(crate) fn verify_column_indexes(metadata: &ParquetMetaData) { let column_index = &column_index[0][float_col_idx]; match column_index { - parquet::file::page_index::index_reader::ColumnIndexMetaData::FLOAT(float_index) => { + parquet::file::page_index::column_index::ColumnIndexMetaData::FLOAT(float_index) => { assert_eq!(float_index.num_pages(), 1); assert_eq!(float_index.min_value(0), Some(&0.0f32)); assert!(float_index From 20df075edd7f71e5c5b5127ddea07582861d7c93 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 11:21:35 -0700 Subject: [PATCH 014/126] add ColumnIndexIterators trait, simplify stats converter a little --- parquet/src/arrow/arrow_reader/statistics.rs | 123 +++---------------- parquet/src/file/page_index/column_index.rs | 57 ++++++++- 2 files changed, 76 insertions(+), 104 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d98732f5d075..21a06050d849 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::column_index::ColumnIndexMetaData; +use crate::file::page_index::column_index::{ColumnIndexIterators, ColumnIndexMetaData}; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ @@ -597,7 +597,7 @@ macro_rules! get_statistics { } macro_rules! make_data_page_stats_iterator { - ($iterator_type: ident, $func: ident, $index_type: path, $stat_value_type: ty, $conv:expr) => { + ($iterator_type: ident, $func: ident, $stat_value_type: ty) => { struct $iterator_type<'a, I> where I: Iterator, @@ -624,19 +624,8 @@ macro_rules! 
make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { - $index_type(native_index) => Some( - native_index - .$func() - .map(|v| v.map($conv)) - .collect::>(), - ), - // No matching `Index` found; - // thus no statistics that can be extracted. - // We return vec![None; len] to effectively - // create an arrow null-array with the length - // corresponding to the number of entries in - // `ParquetOffsetIndex` per row group per column. - _ => Some(vec![None; len]), + ColumnIndexMetaData::NONE => Some(vec![None; len]), + _ => Some(<$stat_value_type>::$func(&index).collect::>()), }, _ => None, } @@ -649,118 +638,46 @@ macro_rules! make_data_page_stats_iterator { }; } -make_data_page_stats_iterator!( - MinBooleanDataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::BOOLEAN, - bool, - |m| *m -); -make_data_page_stats_iterator!( - MaxBooleanDataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::BOOLEAN, - bool, - |m| *m -); -make_data_page_stats_iterator!( - MinInt32DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::INT32, - i32, - |m| *m -); -make_data_page_stats_iterator!( - MaxInt32DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::INT32, - i32, - |m| *m -); -make_data_page_stats_iterator!( - MinInt64DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::INT64, - i64, - |m| *m -); -make_data_page_stats_iterator!( - MaxInt64DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::INT64, - i64, - |m| *m -); +make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator, min_values_iter, bool); +make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator, max_values_iter, bool); +make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter, i32); +make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter, i32); +make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter, i64); +make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter, i64); make_data_page_stats_iterator!( MinFloat16DataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); make_data_page_stats_iterator!( MaxFloat16DataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) -); -make_data_page_stats_iterator!( - MinFloat32DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::FLOAT, - f32, - |m| *m -); -make_data_page_stats_iterator!( - MaxFloat32DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::FLOAT, - f32, - |m| *m -); -make_data_page_stats_iterator!( - MinFloat64DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::DOUBLE, - f64, - |m| *m -); -make_data_page_stats_iterator!( - MaxFloat64DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::DOUBLE, - f64, - |m| *m + FixedLenByteArray ); +make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator, min_values_iter, f32); +make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator, max_values_iter, f32); +make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator, min_values_iter, f64); +make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator, max_values_iter, f64); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::BYTE_ARRAY, - ByteArray, - |m| 
ByteArray::from(m.to_owned()) + ByteArray ); make_data_page_stats_iterator!( MaxByteArrayDataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::BYTE_ARRAY, - ByteArray, - |m| ByteArray::from(m.to_owned()) + ByteArray ); make_data_page_stats_iterator!( MaxFixedLenByteArrayDataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); make_data_page_stats_iterator!( MinFixedLenByteArrayDataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); macro_rules! get_decimal_page_stats_iterator { diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index 3fb6003e7c66..2d43c93b2e4b 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -20,7 +20,10 @@ //! [`ColumnIndex`]: crate::format::ColumnIndex //! -use crate::errors::Result; +use crate::{ + data_type::{ByteArray, FixedLenByteArray}, + errors::Result, +}; use std::ops::Deref; use crate::{ @@ -512,3 +515,55 @@ impl ColumnIndexMetaData { colidx_enum_func!(self, is_null_page, idx) } } + +/// Provides iterators over min and max values of a [`ColumnIndexMetaData`] +pub trait ColumnIndexIterators { + /// Can be one of `bool`, `i32`, `i64`, `Int96`, `f32`, `f64`, [`ByteArray`], + /// or [`FixedLenByteArray`] + type Item; + + /// Return iterator over the min values for the index + fn min_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator>; + + /// Return iterator over the max values for the index + fn max_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator>; +} + +macro_rules! column_index_iters { + ($item: ident, $variant: ident, $conv:expr) => { + impl ColumnIndexIterators for $item { + type Item = $item; + + fn min_values_iter( + colidx: &ColumnIndexMetaData, + ) -> impl Iterator> { + if let ColumnIndexMetaData::$variant(index) = colidx { + index.min_values_iter().map($conv) + } else { + panic!(concat!("Wrong type for ", stringify!($item), " iterator")) + } + } + + fn max_values_iter( + colidx: &ColumnIndexMetaData, + ) -> impl Iterator> { + if let ColumnIndexMetaData::$variant(index) = colidx { + index.max_values_iter().map($conv) + } else { + panic!(concat!("Wrong type for ", stringify!($item), " iterator")) + } + } + } + }; +} + +column_index_iters!(bool, BOOLEAN, |v| v.copied()); +column_index_iters!(i32, INT32, |v| v.copied()); +column_index_iters!(i64, INT64, |v| v.copied()); +column_index_iters!(Int96, INT96, |v| v.copied()); +column_index_iters!(f32, FLOAT, |v| v.copied()); +column_index_iters!(f64, DOUBLE, |v| v.copied()); +column_index_iters!(ByteArray, BYTE_ARRAY, |v| v + .map(|v| ByteArray::from(v.to_owned()))); +column_index_iters!(FixedLenByteArray, FIXED_LEN_BYTE_ARRAY, |v| v + .map(|v| FixedLenByteArray::from(v.to_owned()))); From 7755b7b0af1cfb9d49354efc060a9486f908ae7a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 11:24:33 -0700 Subject: [PATCH 015/126] restore comment --- parquet/src/arrow/arrow_reader/statistics.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 21a06050d849..1613656ab9ae 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -624,6 +624,12 @@ macro_rules! 
make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { + // No matching `Index` found; + // thus no statistics that can be extracted. + // We return vec![None; len] to effectively + // create an arrow null-array with the length + // corresponding to the number of entries in + // `ParquetOffsetIndex` per row group per column. ColumnIndexMetaData::NONE => Some(vec![None; len]), _ => Some(<$stat_value_type>::$func(&index).collect::>()), }, From f6c5738846df0e62b6f855549e8b824ae83aa9f9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 23 Aug 2025 17:09:40 -0700 Subject: [PATCH 016/126] further rework...allow for fallback to slow decoder --- parquet/src/file/page_index/index_reader.rs | 13 +- parquet/src/file/page_index/offset_index.rs | 156 ++++++++++++-------- 2 files changed, 105 insertions(+), 64 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fbe6d3984596..27ad753a0c24 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,7 @@ use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; -use crate::file::page_index::offset_index::OffsetIndexMetaData; +use crate::file::page_index::offset_index::{read_offset_index, OffsetIndexMetaData}; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; @@ -131,7 +131,16 @@ pub fn read_offset_indexes( pub(crate) fn decode_offset_index(data: &[u8]) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - OffsetIndexMetaData::try_from(&mut prot) + + // Try to read fast-path index first. If that fails, fall back to slower but more robust + // reader + match read_offset_index(&mut prot) { + Ok(offset_index) => Ok(offset_index), + Err(_) => { + prot = ThriftCompactInputProtocol::new(data); + OffsetIndexMetaData::try_from(&mut prot) + } + } } thrift_struct!( diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 791f61d37eae..2ec51c5d6ee4 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -25,7 +25,7 @@ use crate::{ thrift_struct, }; -/*thrift_struct!( +thrift_struct!( /// Page location information for [`OffsetIndexMetaData`] pub struct PageLocation { /// Offset of the page in the file @@ -37,67 +37,7 @@ pub struct PageLocation { /// (repetition_level = 0). 3: required i64 first_row_index } -);*/ - -// hand coding this one because it is very time critical - -/// Page location information for [`OffsetIndexMetaData`] -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct PageLocation { - /// Offset of the page in the file - pub offset: i64, - /// Size of the page, including header. Sum of compressed_page_size and header - pub compressed_page_size: i32, - /// Index within the RowGroup of the first row of the page. When an - /// OffsetIndex is present, pages must begin on row boundaries - /// (repetition_level = 0). - pub first_row_index: i64, -} - -// Note: this will fail if the fields are either out of order, or if a suboptimal -// encoder doesn't use field deltas. If that ever occurs, remove this code and -// revert to the commented out thrift_struct!() implementation above. 
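For readers unfamiliar with the encoding being exploited here: thrift's compact protocol packs small field-id deltas and the wire type into a single header byte, which is what both the hand decoder being removed below and its fast-path replacement rely on. A standalone sketch of that math (the helper names are illustrative, not part of the crate):

    fn short_form_header(id_delta: u8, field_type: u8) -> u8 {
        // short form is only legal for deltas 1..=15
        debug_assert!((1..=15).contains(&id_delta));
        (id_delta << 4) | (field_type & 0x0f)
    }

    fn zigzag(v: i64) -> u64 {
        // long-form headers carry the absolute field id as a zigzag varint
        ((v << 1) ^ (v >> 63)) as u64
    }

    fn main() {
        // PageLocation::offset is field 1 with type I64 (0x6): one byte, 0x16
        assert_eq!(short_form_header(1, 0x6), 0x16);
        // an encoder emitting absolute ids would instead write 0x06 then
        // zigzag(1) == 2, the long form that the fast path rejects before
        // falling back to the general decoder
        assert_eq!(zigzag(1), 2);
    }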
-impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for PageLocation { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { - // there are 3 fields, all mandatory, so all field deltas should be 1 - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I64 as u8 { - return Err(general_err!("error reading PageLocation::offset")); - } - let offset = prot.read_i64()?; - - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I32 as u8 { - return Err(general_err!( - "error reading PageLocation::compressed_page_size" - )); - } - let compressed_page_size = prot.read_i32()?; - - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I64 as u8 { - return Err(general_err!("error reading PageLocation::first_row_index")); - } - let first_row_index = prot.read_i64()?; - - // This loop slows things down a bit, but it's an acceptible price to allow - // forwards compatibility. We could instead assert the next field is Stop. - loop { - let (field_type, _) = prot.read_field_header()?; - if field_type == FieldType::Stop as u8 { - break; - } - prot.skip(FieldType::try_from(field_type)?)?; - } - - Ok(Self { - offset, - compressed_page_size, - first_row_index, - }) - } -} +); impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { @@ -165,3 +105,95 @@ impl OffsetIndexMetaData { ) } } + +// hand coding this one because it is very time critical + +// Note: this will fail if the fields are either out of order, or if a suboptimal +// encoder doesn't use field deltas. +fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + // there are 3 fields, all mandatory, so all field deltas should be 1 + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::offset")); + } + let offset = prot.read_i64()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I32 as u8 { + return Err(general_err!( + "error reading PageLocation::compressed_page_size" + )); + } + let compressed_page_size = prot.read_i32()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::first_row_index")); + } + let first_row_index = prot.read_i64()?; + + // read end of struct...return error if there are unknown fields present + let (field_type, _) = prot.read_field_header()?; + if field_type != FieldType::Stop as u8 { + return Err(general_err!("unexpected field in PageLocation")); + } + + Ok(PageLocation { + offset, + compressed_page_size, + first_row_index, + }) +} + +// Fast-path read of offset index. this all works because we expect all field deltas to be 1, +// and there's no nesting beyond PageLocation, so no need to save the last field id. Like +// read_page_locations(), this will fail if absolute field id's are used. +pub(super) fn read_offset_index<'a>( + prot: &mut ThriftCompactInputProtocol<'a>, +) -> Result { + // Offset index is a struct with 2 fields. First field is an array of PageLocations, + // the second an optional array of i64. 
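+    // Illustrative wire shape (not normative): field 1 arrives as the single
+    // header byte 0x19 (id delta 1 << 4 | List type 0x9). For fewer than 15
+    // pages the list header is one byte, (size << 4) | 0xc (struct elements);
+    // longer lists use 0xfc followed by a varint size.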
+ + // read field 1 header, then list header, then vec of PageLocations + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::List as u8 { + return Err(general_err!("error reading OffsetIndex::page_locations")); + } + let list_ident = prot.read_list_begin()?; + let mut page_locations = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + page_locations.push(read_page_location(prot)?); + } + + let mut unencoded_byte_array_data_bytes: Option> = None; + + // read second field...if it's Stop we're done + let (mut field_type, delta) = prot.read_field_header()?; + if field_type == FieldType::List as u8 { + if delta != 1 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + let list_ident = prot.read_list_begin()?; + let mut vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + vec.push(prot.read_i64()?); + } + unencoded_byte_array_data_bytes = Some(vec); + + // this one should be Stop + (field_type, _) = prot.read_field_header()?; + } + + if field_type != FieldType::Stop as u8 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + + Ok(OffsetIndexMetaData { + page_locations, + unencoded_byte_array_data_bytes, + }) +} From 09d71e179ffe4273f95ae6493d260d22a5346c22 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 23 Aug 2025 18:50:36 -0700 Subject: [PATCH 017/126] refactor a bit --- parquet/src/file/page_index/index_reader.rs | 8 +- parquet/src/file/page_index/offset_index.rs | 104 ++++++++++---------- 2 files changed, 55 insertions(+), 57 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 27ad753a0c24..fb3519b5cbb2 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,7 @@ use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; -use crate::file::page_index::offset_index::{read_offset_index, OffsetIndexMetaData}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; @@ -132,9 +132,9 @@ pub fn read_offset_indexes( pub(crate) fn decode_offset_index(data: &[u8]) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - // Try to read fast-path index first. If that fails, fall back to slower but more robust - // reader - match read_offset_index(&mut prot) { + // Try to read fast-path first. If that fails, fall back to slower but more robust + // decoder. + match OffsetIndexMetaData::try_from_fast(&mut prot) { Ok(offset_index) => Ok(offset_index), Err(_) => { prot = ThriftCompactInputProtocol::new(data); diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 2ec51c5d6ee4..d6baa1e44892 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -104,6 +104,57 @@ impl OffsetIndexMetaData { self.unencoded_byte_array_data_bytes.clone(), ) } + + // Fast-path read of offset index. This works because we expect all field deltas to be 1, + // and there's no nesting beyond PageLocation, so no need to save the last field id. Like + // read_page_locations(), this will fail if absolute field id's are used. 
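+    // If this returns an error, decode_offset_index() simply rebuilds the
+    // protocol over the same buffer and retries with the general decoder, so
+    // rejecting unusual (but still legal) encodings here costs nothing in
+    // correctness.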
+    pub(super) fn try_from_fast<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self> {
+        // Offset index is a struct with 2 fields. First field is an array of PageLocations,
+        // the second an optional array of i64.
+
+        // read field 1 header, then list header, then vec of PageLocations
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::List as u8 {
+            return Err(general_err!("error reading OffsetIndex::page_locations"));
+        }
+        let list_ident = prot.read_list_begin()?;
+        let mut page_locations = Vec::with_capacity(list_ident.size as usize);
+        for _ in 0..list_ident.size {
+            page_locations.push(read_page_location(prot)?);
+        }
+
+        let mut unencoded_byte_array_data_bytes: Option<Vec<i64>> = None;
+
+        // read second field...if it's Stop we're done
+        let (mut field_type, delta) = prot.read_field_header()?;
+        if field_type == FieldType::List as u8 {
+            if delta != 1 {
+                return Err(general_err!(
+                    "encountered unknown field while reading OffsetIndex"
+                ));
+            }
+            let list_ident = prot.read_list_begin()?;
+            let mut vec = Vec::with_capacity(list_ident.size as usize);
+            for _ in 0..list_ident.size {
+                vec.push(prot.read_i64()?);
+            }
+            unencoded_byte_array_data_bytes = Some(vec);
+
+            // this one should be Stop
+            (field_type, _) = prot.read_field_header()?;
+        }
+
+        if field_type != FieldType::Stop as u8 {
+            return Err(general_err!(
+                "encountered unknown field while reading OffsetIndex"
+            ));
+        }
+
+        Ok(Self {
+            page_locations,
+            unencoded_byte_array_data_bytes,
+        })
+    }
 }
 
 // hand coding this one because it is very time critical
@@ -144,56 +195,3 @@ fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result
         first_row_index,
     })
 }
-
-// Fast-path read of offset index. this all works because we expect all field deltas to be 1,
-// and there's no nesting beyond PageLocation, so no need to save the last field id. Like
-// read_page_locations(), this will fail if absolute field id's are used.
-pub(super) fn read_offset_index<'a>(
-    prot: &mut ThriftCompactInputProtocol<'a>,
-) -> Result {
-    // Offset index is a struct with 2 fields. First field is an array of PageLocations,
-    // the second an optional array of i64.
-
-    // read field 1 header, then list header, then vec of PageLocations
-    let (field_type, delta) = prot.read_field_header()?;
-    if delta != 1 || field_type != FieldType::List as u8 {
-        return Err(general_err!("error reading OffsetIndex::page_locations"));
-    }
-    let list_ident = prot.read_list_begin()?;
-    let mut page_locations = Vec::with_capacity(list_ident.size as usize);
-    for _ in 0..list_ident.size {
-        page_locations.push(read_page_location(prot)?);
-    }
-
-    let mut unencoded_byte_array_data_bytes: Option> = None;
-
-    // read second field...if it's Stop we're done
-    let (mut field_type, delta) = prot.read_field_header()?;
-    if field_type == FieldType::List as u8 {
-        if delta != 1 {
-            return Err(general_err!(
-                "encountered unknown field while reading OffsetIndex"
-            ));
-        }
-        let list_ident = prot.read_list_begin()?;
-        let mut vec = Vec::with_capacity(list_ident.size as usize);
-        for _ in 0..list_ident.size {
-            vec.push(prot.read_i64()?);
-        }
-        unencoded_byte_array_data_bytes = Some(vec);
-
-        // this one should be Stop
-        (field_type, _) = prot.read_field_header()?;
-    }
-
-    if field_type != FieldType::Stop as u8 {
-        return Err(general_err!(
-            "encountered unknown field while reading OffsetIndex"
-        ));
-    }
-
-    Ok(OffsetIndexMetaData {
-        page_locations,
-        unencoded_byte_array_data_bytes,
-    })
-}

From 1ddaa35b7b7deb8779c84af1da4a8a77fb790aaa Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Sat, 23 Aug 2025 19:17:25 -0700
Subject: [PATCH 018/126] simplify reading of int array

---
 parquet/src/file/page_index/offset_index.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs
index d6baa1e44892..6cb7539cb573 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -117,6 +117,8 @@ impl OffsetIndexMetaData {
         if delta != 1 || field_type != FieldType::List as u8 {
             return Err(general_err!("error reading OffsetIndex::page_locations"));
         }
+
+        // we have to do this manually because we want to use the fast PageLocation decoder
         let list_ident = prot.read_list_begin()?;
         let mut page_locations = Vec::with_capacity(list_ident.size as usize);
         for _ in 0..list_ident.size {
@@ -133,11 +135,7 @@ impl OffsetIndexMetaData {
             if delta != 1 {
                 return Err(general_err!(
                     "encountered unknown field while reading OffsetIndex"
                 ));
             }
-            let list_ident = prot.read_list_begin()?;
-            let mut vec = Vec::with_capacity(list_ident.size as usize);
-            for _ in 0..list_ident.size {
-                vec.push(prot.read_i64()?);
-            }
+            let vec = Vec::<i64>::try_from(&mut *prot)?;
             unencoded_byte_array_data_bytes = Some(vec);
 
             // this one should be Stop

From c271085fb1153ce0617df2148fe95fda6065853e Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 25 Aug 2025 08:48:04 -0700
Subject: [PATCH 019/126] get write working for enum and some unions

---
 parquet/src/basic.rs          |   5 +-
 parquet/src/parquet_macros.rs |  18 ++++
 parquet/src/parquet_thrift.rs | 149 +++++++++++++++++++++++++++++++++-
 3 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index c325cf5dbf2b..79891822a242 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -20,11 +20,14 @@
 //! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift)
 //! file to see raw definitions.
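Before diving into the diff: under the compact protocol an enum is just its i32 value written as a zigzag varint, and an all-empty union variant is a nested empty struct, so the writers added below produce very small outputs. A rough sketch of the byte shapes (the helper is invented for illustration; Type::INT64's value of 2 comes from parquet.thrift):

    fn zigzag_varint(v: i32) -> Vec<u8> {
        // zigzag fold, then 7-bit little-endian varint
        let mut z = ((v << 1) ^ (v >> 31)) as u32;
        let mut out = Vec::new();
        while z > 0x7f {
            out.push((z as u8) | 0x80);
            z >>= 7;
        }
        out.push(z as u8);
        out
    }

    fn main() {
        // an enum such as Type::INT64 (value 2) encodes as the single byte 0x04
        assert_eq!(zigzag_varint(2), vec![0x04]);
        // an empty union variant with field id 3 would encode as three bytes:
        //   0x3c (delta 3 << 4 | Struct 0xc), then 0x00 ending the inner
        //   empty struct, then 0x00 ending the union itself
    }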
+use std::io::Write;
 use std::str::FromStr;
 use std::{fmt, str};
 
 pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel};
-use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol};
+use crate::parquet_thrift::{
+    FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
+};
 use crate::{thrift_enum, thrift_struct, thrift_union_all_empty};
 
 use crate::errors::{ParquetError, Result};
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index 2d1ccd819b37..9a8a9ae4e4f9 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -51,6 +51,12 @@ macro_rules! thrift_enum {
         }
     }
 
+    impl WriteThrift for $identifier {
+        fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+            (*self as i32).write_thrift(writer)
+        }
+    }
+
     // TODO: remove when we finally get rid of the format module
     impl TryFrom<crate::format::$identifier> for $identifier {
         type Error = ParquetError;
@@ -119,6 +125,18 @@ macro_rules! thrift_union_all_empty {
         }
     }
 
+    impl WriteThrift for $identifier {
+        fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+            match *self {
+                $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)*
+            }
+            // write end of struct for empty struct
+            writer.write_struct_end()?;
+            // write end of struct for this union
+            writer.write_struct_end()
+        }
+    }
+
     // TODO: remove when we finally get rid of the format module
     impl From<crate::format::$identifier> for $identifier {
         fn from(value: crate::format::$identifier) -> Self {
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index 2dff498372f0..f156eed31bb4 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -20,7 +20,7 @@
 // to not allocate byte arrays or strings.
 #![allow(dead_code)]
 
-use std::cmp::Ordering;
+use std::{cmp::Ordering, io::Write};
 
 use crate::errors::{ParquetError, Result};
 
@@ -539,3 +539,150 @@ where
         Ok(res)
     }
 }
+
+/////////////////////////
+// thrift compact output
+
+pub(crate) struct ThriftCompactOutputProtocol<W: Write> {
+    writer: W,
+}
+
+impl<W: Write> ThriftCompactOutputProtocol<W> {
+    pub(crate) fn new(writer: W) -> Self {
+        Self { writer }
+    }
+
+    pub(crate) fn inner(&self) -> &W {
+        &self.writer
+    }
+
+    fn write_byte(&mut self, b: u8) -> Result<()> {
+        self.writer.write_all(&[b])?;
+        Ok(())
+    }
+
+    fn write_vlq(&mut self, val: u64) -> Result<()> {
+        let mut v = val;
+        while v > 0x7f {
+            self.write_byte(v as u8 | 0x80)?;
+            v >>= 7;
+        }
+        self.write_byte(v as u8)
+    }
+
+    fn write_zig_zag(&mut self, val: i64) -> Result<()> {
+        let s = (val < 0) as i64;
+        self.write_vlq((((val ^ -s) << 1) + s) as u64)
+    }
+
+    pub(crate) fn write_field_begin(
+        &mut self,
+        field_type: FieldType,
+        field_id: i16,
+        last_field_id: i16,
+    ) -> Result<()> {
+        let mut delta = field_id - last_field_id;
+        if delta > 0xf || delta < 0 {
+            delta = 0;
+        }
+        if delta > 0 {
+            self.write_byte((delta as u8) << 4 | field_type as u8)
+        } else {
+            // long form: field type with an empty delta nibble, followed by
+            // the absolute field id as a zigzag varint
+            self.write_byte(field_type as u8)?;
+            self.write_i16(field_id)
+        }
+    }
+
+    pub(crate) fn write_struct_end(&mut self) -> Result<()> {
+        self.write_byte(0)
+    }
+
+    pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> {
+        self.write_byte(val as u8)
+    }
+
+    pub(crate) fn write_i16(&mut self, val: i16) -> Result<()> {
+        self.write_zig_zag(val as _)
+    }
+
+    pub(crate) fn write_i32(&mut self, val: i32) -> Result<()> {
+        self.write_zig_zag(val as _)
+    }
+
+    pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> {
+        self.write_zig_zag(val as _)
+    }
+}
+
+pub(crate) trait WriteThrift {
+    fn
write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; +} + +impl WriteThrift for i8 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i8(*self) + } +} + +impl WriteThrift for i16 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i16(*self) + } +} + +impl WriteThrift for i32 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i32(*self) + } +} + +impl WriteThrift for i64 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i64(*self) + } +} + +#[cfg(test)] +#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module +mod tests { + use crate::basic::{TimeUnit, Type}; + + use super::*; + use std::fmt::Debug; + + fn test_roundtrip(val: T) + where + T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + + WriteThrift> + + PartialEq + + Debug, + for<'a> >>::Error: Debug, + { + let buf = Vec::::new(); + let mut writer = ThriftCompactOutputProtocol::new(buf); + val.write_thrift(&mut writer).unwrap(); + + let mut prot = ThriftCompactInputProtocol::new(writer.inner()); + let read_val = T::try_from(&mut prot).unwrap(); + assert_eq!(val, read_val); + } + + #[test] + fn test_enum_roundtrip() { + test_roundtrip(Type::BOOLEAN); + test_roundtrip(Type::INT32); + test_roundtrip(Type::INT64); + test_roundtrip(Type::INT96); + test_roundtrip(Type::FLOAT); + test_roundtrip(Type::DOUBLE); + test_roundtrip(Type::BYTE_ARRAY); + test_roundtrip(Type::FIXED_LEN_BYTE_ARRAY); + } + + #[test] + fn test_union_all_empty_roundtrip() { + test_roundtrip(TimeUnit::MILLIS); + test_roundtrip(TimeUnit::MICROS); + test_roundtrip(TimeUnit::NANOS); + } +} From 34cdaf2df90e5995b029fcd8acbd6123d2f8fdce Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 10:25:33 -0700 Subject: [PATCH 020/126] make test_roundtrip visible --- parquet/src/parquet_thrift.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index f156eed31bb4..fef0b2faecda 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -644,13 +644,13 @@ impl WriteThrift for i64 { #[cfg(test)] #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module -mod tests { +pub(crate) mod tests { use crate::basic::{TimeUnit, Type}; use super::*; use std::fmt::Debug; - fn test_roundtrip(val: T) + pub(crate) fn test_roundtrip(val: T) where T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + WriteThrift> From c9be57047d8095442e7248be0a49ef98e9ce5f3f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 10:28:02 -0700 Subject: [PATCH 021/126] add test for converted_type, start on logical_type --- parquet/src/basic.rs | 83 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 79891822a242..788a38743db9 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -196,6 +196,13 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } } +impl WriteThrift for ConvertedType { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + // because we've added NONE, the variant values are off by 1, so correct that here + writer.write_i32(*self as i32 - 1) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift union 
`crate::format::TimeUnit` @@ -453,6 +460,35 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } } +impl WriteThrift for LogicalType { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match *self { + Self::String => { + writer.write_field_begin(FieldType::Struct, 1, 0)?; + writer.write_struct_end()?; + } + Self::Map => { + writer.write_field_begin(FieldType::Struct, 2, 0)?; + writer.write_struct_end()?; + } + Self::List => { + writer.write_field_begin(FieldType::Struct, 3, 0)?; + writer.write_struct_end()?; + } + Self::Enum => { + writer.write_field_begin(FieldType::Struct, 4, 0)?; + writer.write_struct_end()?; + } + Self::Decimal { scale, precision } => { + writer.write_field_begin(FieldType::Struct, 4, 0)?; + DecimalType { scale, precision }.write_thrift(writer)?; + } + _ => return Err(nyi_err!("logical type")), + } + writer.write_struct_end() + } +} + // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::FieldRepetitionType` // @@ -996,6 +1032,20 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } } +impl WriteThrift for ColumnOrder { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match *self { + Self::TYPE_DEFINED_ORDER(_) => { + writer.write_field_begin(FieldType::Struct, 1, 0)?; + writer.write_struct_end()?; + } + _ => return Err(general_err!("Attempt to write undefined ColumnOrder")), + } + // write end of struct for this union + writer.write_struct_end() + } +} + // ---------------------------------------------------------------------- // Display handlers @@ -1445,6 +1495,7 @@ impl str::FromStr for LogicalType { #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module mod tests { use super::*; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_display_type() { @@ -1552,6 +1603,32 @@ mod tests { ); } + #[test] + fn test_converted_type_roundtrip() { + test_roundtrip(ConvertedType::UTF8); + test_roundtrip(ConvertedType::MAP); + test_roundtrip(ConvertedType::MAP_KEY_VALUE); + test_roundtrip(ConvertedType::LIST); + test_roundtrip(ConvertedType::ENUM); + test_roundtrip(ConvertedType::DECIMAL); + test_roundtrip(ConvertedType::DATE); + test_roundtrip(ConvertedType::TIME_MILLIS); + test_roundtrip(ConvertedType::TIME_MICROS); + test_roundtrip(ConvertedType::TIMESTAMP_MILLIS); + test_roundtrip(ConvertedType::TIMESTAMP_MICROS); + test_roundtrip(ConvertedType::UINT_8); + test_roundtrip(ConvertedType::UINT_16); + test_roundtrip(ConvertedType::UINT_32); + test_roundtrip(ConvertedType::UINT_64); + test_roundtrip(ConvertedType::INT_8); + test_roundtrip(ConvertedType::INT_16); + test_roundtrip(ConvertedType::INT_32); + test_roundtrip(ConvertedType::INT_64); + test_roundtrip(ConvertedType::JSON); + test_roundtrip(ConvertedType::BSON); + test_roundtrip(ConvertedType::INTERVAL); + } + #[test] fn test_display_converted_type() { assert_eq!(ConvertedType::NONE.to_string(), "NONE"); @@ -2414,6 +2491,12 @@ mod tests { assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED"); } + #[test] + fn test_column_order_roundtrip() { + // SortOrder::SIGNED is the default on read. + test_roundtrip(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED)) + } + #[test] fn test_column_order_get_logical_type_sort_order() { // Helper to check the order in a list of values. 
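The off-by-one in ConvertedType's writer (and the matching adjustment in its reader) is worth spelling out: the Rust enum adds a NONE variant at discriminant 0 that has no thrift counterpart, so every wire value is the Rust discriminant minus one. A standalone sketch of the invariant, using an invented mini-enum rather than the crate's types:

    #[derive(Clone, Copy, PartialEq, Debug)]
    enum Converted {
        None,        // Rust-side only; never hits the wire
        Utf8,        // thrift UTF8 = 0
        Map,         // thrift MAP = 1
        MapKeyValue, // thrift MAP_KEY_VALUE = 2
        List,        // thrift LIST = 3
    }

    fn to_wire(v: Converted) -> i32 {
        v as i32 - 1
    }

    fn from_wire(w: i32) -> Converted {
        match w {
            0 => Converted::Utf8,
            1 => Converted::Map,
            2 => Converted::MapKeyValue,
            3 => Converted::List,
            _ => Converted::None,
        }
    }

    fn main() {
        for v in [Converted::Utf8, Converted::Map, Converted::List] {
            assert_eq!(from_wire(to_wire(v)), v);
        }
        assert_eq!(to_wire(Converted::Utf8), 0); // UTF8 is 0 on the wire
    }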
From a9cd09dc49b62dd763043d110265cda82ee10e66 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 12:26:47 -0700 Subject: [PATCH 022/126] checkpoint struct field writing --- parquet/src/basic.rs | 262 +++++++++++++++++++++++++++++++++- parquet/src/parquet_macros.rs | 18 ++- parquet/src/parquet_thrift.rs | 106 ++++++++++++-- 3 files changed, 370 insertions(+), 16 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 788a38743db9..0371cc638b8f 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -27,6 +27,7 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -227,6 +228,31 @@ struct DecimalType { } ); +impl WriteThrift for DecimalType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self.scale.write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .precision + .write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for DecimalType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct TimestampType { 1: required bool is_adjusted_to_u_t_c @@ -234,6 +260,31 @@ struct TimestampType { } ); +impl WriteThrift for TimestampType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .is_adjusted_to_u_t_c + .write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self.unit.write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for TimestampType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // they are identical use TimestampType as TimeType; @@ -244,6 +295,33 @@ struct IntType { } ); +impl WriteThrift for IntType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .bit_width + .write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .is_signed + .write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for IntType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct VariantType { // The version of the variant specification that the variant was @@ -252,12 +330,66 @@ struct VariantType { } ); +impl WriteThrift for VariantType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.specification_version.is_some() { + 
last_field_id = + self.specification_version + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl WriteThriftField for VariantType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct GeometryType<'a> { 1: optional string<'a> crs; } ); +impl<'a, W: Write> WriteThrift for GeometryType<'a> { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.crs.is_some() { + last_field_id = self + .crs + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl<'a, W: Write> WriteThriftField for GeometryType<'a> { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct GeographyType<'a> { 1: optional string<'a> crs; @@ -265,6 +397,40 @@ struct GeographyType<'a> { } ); +impl<'a, W: Write> WriteThrift for GeographyType<'a> { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.crs.is_some() { + last_field_id = self + .crs + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + if self.algorithm.is_some() { + last_field_id = + self.algorithm + .as_ref() + .unwrap() + .write_thrift_field(writer, 2, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl<'a, W: Write> WriteThriftField for GeographyType<'a> { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + /// Logical types used by version 2.4.0+ of the Parquet format. 
/// /// This is an *entirely new* struct as of version @@ -462,7 +628,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { impl WriteThrift for LogicalType { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - match *self { + match self { Self::String => { writer.write_field_begin(FieldType::Struct, 1, 0)?; writer.write_struct_end()?; @@ -480,8 +646,86 @@ impl WriteThrift for LogicalType { writer.write_struct_end()?; } Self::Decimal { scale, precision } => { - writer.write_field_begin(FieldType::Struct, 4, 0)?; - DecimalType { scale, precision }.write_thrift(writer)?; + DecimalType { + scale: *scale, + precision: *precision, + } + .write_thrift_field(writer, 5, 0)?; + } + Self::Date => { + writer.write_field_begin(FieldType::Struct, 6, 0)?; + writer.write_struct_end()?; + } + Self::Time { + is_adjusted_to_u_t_c, + unit, + } => { + TimeType { + is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c, + unit: *unit, + } + .write_thrift_field(writer, 7, 0)?; + } + Self::Timestamp { + is_adjusted_to_u_t_c, + unit, + } => { + TimestampType { + is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c, + unit: *unit, + } + .write_thrift_field(writer, 8, 0)?; + } + Self::Integer { + bit_width, + is_signed, + } => { + IntType { + bit_width: *bit_width, + is_signed: *is_signed, + } + .write_thrift_field(writer, 10, 0)?; + } + Self::Unknown => { + writer.write_field_begin(FieldType::Struct, 11, 0)?; + writer.write_struct_end()?; + } + Self::Json => { + writer.write_field_begin(FieldType::Struct, 12, 0)?; + writer.write_struct_end()?; + } + Self::Bson => { + writer.write_field_begin(FieldType::Struct, 13, 0)?; + writer.write_struct_end()?; + } + Self::Uuid => { + writer.write_field_begin(FieldType::Struct, 14, 0)?; + writer.write_struct_end()?; + } + Self::Float16 => { + writer.write_field_begin(FieldType::Struct, 15, 0)?; + writer.write_struct_end()?; + } + Self::Variant { + specification_version, + } => { + VariantType { + specification_version: *specification_version, + } + .write_thrift_field(writer, 16, 0)?; + } + Self::Geometry { crs } => { + GeometryType { + crs: crs.as_ref().map(|s| s.as_str()), + } + .write_thrift_field(writer, 17, 0)?; + } + Self::Geography { crs, algorithm } => { + GeographyType { + crs: crs.as_ref().map(|s| s.as_str()), + algorithm: *algorithm, + } + .write_thrift_field(writer, 18, 0)?; } _ => return Err(nyi_err!("logical type")), } @@ -2186,6 +2430,18 @@ mod tests { ); } + #[test] + fn test_logical_type_roundtrip() { + test_roundtrip(LogicalType::String); + test_roundtrip(LogicalType::Map); + test_roundtrip(LogicalType::List); + test_roundtrip(LogicalType::Enum); + test_roundtrip(LogicalType::Decimal { + scale: 0, + precision: 20, + }); + } + #[test] fn test_display_repetition() { assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED"); diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 9a8a9ae4e4f9..e6d35dedca01 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -53,7 +53,15 @@ macro_rules! 
thrift_enum {
 
     impl WriteThrift for $identifier {
         fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
-            (*self as i32).write_thrift(writer)
+            writer.write_i32(*self as i32)
         }
     }
 
+    impl WriteThriftField for $identifier {
+        fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result<i16> {
+            writer.write_field_begin(FieldType::I32, field_id, last_field_id)?;
+            self.write_thrift(writer)?;
+            Ok(field_id)
+        }
+    }
+
@@ -137,6 +145,14 @@ macro_rules! thrift_union_all_empty {
         }
     }
 
+    impl WriteThriftField for $identifier {
+        fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result<i16> {
+            writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?;
+            self.write_thrift(writer)?;
+            Ok(field_id)
+        }
+    }
+
     // TODO: remove when we finally get rid of the format module
     impl From<crate::format::$identifier> for $identifier {
         fn from(value: crate::format::$identifier) -> Self {
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index fef0b2faecda..d332565a2f59 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -597,6 +597,19 @@ impl<W: Write> ThriftCompactOutputProtocol<W> {
         self.write_byte(0)
     }
 
+    pub(crate) fn write_bytes(&mut self, val: &[u8]) -> Result<()> {
+        self.write_vlq(val.len() as u64)?;
+        self.writer.write_all(val)?;
+        Ok(())
+    }
+
+    pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> {
+        match val {
+            true => self.write_byte(1),
+            false => self.write_byte(2),
+        }
+    }
+
     pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> {
         self.write_byte(val as u8)
     }
@@ -615,30 +628,99 @@ impl<W: Write> ThriftCompactOutputProtocol<W> {
 }
 
 pub(crate) trait WriteThrift {
+    // used to write generated enums and structs
     fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>;
 }
 
+pub(crate) trait WriteThriftField {
+    // used to write struct fields (which may be basic types or generated types).
+    // write the field header and field value. returns `field_id`.
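+    // Implementations return the field id they just wrote so the caller can
+    // feed it back in as `last_field_id` for the next field, keeping headers
+    // in the one-byte delta form whenever consecutive ids are close enough.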
+ fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result; +} + +impl WriteThriftField for bool { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + // boolean only writes the field header + match *self { + true => writer.write_field_begin(FieldType::BooleanTrue, field_id, last_field_id)?, + false => writer.write_field_begin(FieldType::BooleanFalse, field_id, last_field_id)?, + } + Ok(field_id) } } -impl WriteThrift for i16 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i16(*self) +impl WriteThriftField for i8 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Byte, field_id, last_field_id)?; + writer.write_i8(*self)?; + Ok(field_id) } } -impl WriteThrift for i32 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i32(*self) +impl WriteThriftField for i16 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I16, field_id, last_field_id)?; + writer.write_i16(*self)?; + Ok(field_id) + } +} + +impl WriteThriftField for i32 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + writer.write_i32(*self)?; + Ok(field_id) } } -impl WriteThrift for i64 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i64(*self) +impl WriteThriftField for i64 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I64, field_id, last_field_id)?; + writer.write_i64(*self)?; + Ok(field_id) + } +} + +impl WriteThriftField for &str { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self.as_bytes())?; + Ok(field_id) } } From ae65167a8862fd1259cbbb6569373f967110e29b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 15:03:37 -0700 Subject: [PATCH 023/126] get some struct examples and lists working --- parquet/src/basic.rs | 91 +++++++++++++++++++- parquet/src/file/page_index/offset_index.rs | 80 +++++++++++++++++- parquet/src/parquet_macros.rs | 4 + parquet/src/parquet_thrift.rs | 93 +++++++++++++++++++-- 4 files changed, 260 insertions(+), 8 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 0371cc638b8f..50b920401646 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -26,7 +26,7 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ - FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -198,6 +198,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } impl WriteThrift for ConvertedType { + const 
ELEMENT_TYPE: ElementType = ElementType::I32; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { // because we've added NONE, the variant values are off by 1, so correct that here writer.write_i32(*self as i32 - 1) @@ -229,6 +231,8 @@ struct DecimalType { ); impl WriteThrift for DecimalType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -261,6 +265,8 @@ struct TimestampType { ); impl WriteThrift for TimestampType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -296,6 +302,8 @@ struct IntType { ); impl WriteThrift for IntType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -331,6 +339,8 @@ struct VariantType { ); impl WriteThrift for VariantType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -364,6 +374,8 @@ struct GeometryType<'a> { ); impl<'a, W: Write> WriteThrift for GeometryType<'a> { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -398,6 +410,8 @@ struct GeographyType<'a> { ); impl<'a, W: Write> WriteThrift for GeographyType<'a> { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -627,6 +641,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } impl WriteThrift for LogicalType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { @@ -1277,6 +1293,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } impl WriteThrift for ColumnOrder { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { Self::TYPE_DEFINED_ORDER(_) => { @@ -2440,6 +2458,77 @@ mod tests { scale: 0, precision: 20, }); + test_roundtrip(LogicalType::Date); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MICROS, + }); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS, + }); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::NANOS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MILLIS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 8, + is_signed: true, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 16, + is_signed: false, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 32, + is_signed: true, + }); + 
test_roundtrip(LogicalType::Integer { + bit_width: 64, + is_signed: false, + }); + test_roundtrip(LogicalType::Json); + test_roundtrip(LogicalType::Bson); + test_roundtrip(LogicalType::Uuid); + test_roundtrip(LogicalType::Float16); + test_roundtrip(LogicalType::Variant { + specification_version: Some(1), + }); + test_roundtrip(LogicalType::Variant { + specification_version: None, + }); + test_roundtrip(LogicalType::Geometry { + crs: Some("foo".to_owned()), + }); + test_roundtrip(LogicalType::Geometry { crs: None }); + test_roundtrip(LogicalType::Geography { + crs: Some("foo".to_owned()), + algorithm: Some(EdgeInterpolationAlgorithm::ANDOYER), + }); + test_roundtrip(LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + }); + test_roundtrip(LogicalType::Geography { + crs: Some("foo".to_owned()), + algorithm: None, + }); + test_roundtrip(LogicalType::Geography { + crs: None, + algorithm: None, + }); } #[test] diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 6cb7539cb573..8217fa7878c8 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -19,7 +19,12 @@ //! //! [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use std::io::Write; + +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::{ errors::{ParquetError, Result}, thrift_struct, @@ -39,6 +44,23 @@ pub struct PageLocation { } ); +impl WriteThrift for PageLocation { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self.offset.write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .compressed_page_size + .write_thrift_field(writer, 2, last_field_id)?; + last_field_id = self + .first_row_index + .write_thrift_field(writer, 3, last_field_id)?; + writer.write_struct_end() + } +} + impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { Self { @@ -73,6 +95,29 @@ pub struct OffsetIndexMetaData { } ); +impl WriteThrift for OffsetIndexMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift( + &self, + writer: &mut crate::parquet_thrift::ThriftCompactOutputProtocol, + ) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .page_locations + .write_thrift_field(writer, 1, last_field_id)?; + if self.unencoded_byte_array_data_bytes.is_some() { + last_field_id = self + .unencoded_byte_array_data_bytes + .as_ref() + .unwrap() + .write_thrift_field(writer, 2, last_field_id)?; + } + writer.write_struct_end() + } +} + impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. /// @@ -193,3 +238,36 @@ fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result

WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::I32; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self as i32) } @@ -134,6 +136,8 @@ macro_rules! thrift_union_all_empty { } impl WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)* diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index d332565a2f59..80427ddf1359 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -581,15 +581,21 @@ impl ThriftCompactOutputProtocol { field_id: i16, last_field_id: i16, ) -> Result<()> { - let mut delta = field_id - last_field_id; - if delta > 0xf || delta < 0 { - delta = 0; - } - if delta > 0 { + let delta = field_id.wrapping_sub(last_field_id); + if delta > 0 && delta <= 0xf { self.write_byte((delta as u8) << 4 | field_type as u8) } else { self.write_byte(field_type as u8)?; - self.write_i16(delta) + self.write_i16(field_id) + } + } + + pub(crate) fn write_list_begin(&mut self, element_type: ElementType, len: usize) -> Result<()> { + if len < 15 { + self.write_byte((len as u8) << 4 | element_type as u8) + } else { + self.write_byte(0xf0u8 | element_type as u8)?; + self.write_vlq(len as _) } } @@ -628,10 +634,67 @@ impl ThriftCompactOutputProtocol { } pub(crate) trait WriteThrift { + const ELEMENT_TYPE: ElementType; + // used to write generated enums and structs fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } +impl WriteThrift for Vec +where + T: WriteThrift, +{ + const ELEMENT_TYPE: ElementType = ElementType::List; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; + for i in 0..self.len() { + self[i].write_thrift(writer)?; + } + Ok(()) + } +} + +impl WriteThrift for bool { + const ELEMENT_TYPE: ElementType = ElementType::Bool; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bool(*self) + } +} + +impl WriteThrift for i8 { + const ELEMENT_TYPE: ElementType = ElementType::Byte; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i8(*self) + } +} + +impl WriteThrift for i16 { + const ELEMENT_TYPE: ElementType = ElementType::I16; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i16(*self) + } +} + +impl WriteThrift for i32 { + const ELEMENT_TYPE: ElementType = ElementType::I32; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i32(*self) + } +} + +impl WriteThrift for i64 { + const ELEMENT_TYPE: ElementType = ElementType::I64; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i64(*self) + } +} + pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. 
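For reference, the header packing implemented by write_field_begin and
write_list_begin above, as a standalone sketch (helper names are illustrative,
not crate API). The short form packs a field-id delta of 1..=15 (or a list
length of 0..=14) into the high nibble of a single byte; otherwise the long
form writes the bare type byte followed by the absolute field id as a zigzag
varint, which is what the `self.write_i16(field_id)` fix above now emits in
place of the old (possibly negative) delta.

    fn short_field_header(field_type: u8, delta: u8) -> u8 {
        debug_assert!((1..=0xf).contains(&delta)); // otherwise: long form
        (delta << 4) | (field_type & 0x0f)
    }

    fn short_list_header(element_type: u8, len: usize) -> u8 {
        debug_assert!(len < 15); // otherwise: 0xf0 | type, then a varint length
        ((len as u8) << 4) | (element_type & 0x0f)
    }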
@@ -724,6 +787,22 @@ impl WriteThriftField for &str { } } +impl WriteThriftField for Vec +where + T: WriteThrift, +{ + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::List, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + #[cfg(test)] #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module pub(crate) mod tests { @@ -744,6 +823,8 @@ pub(crate) mod tests { let mut writer = ThriftCompactOutputProtocol::new(buf); val.write_thrift(&mut writer).unwrap(); + //println!("serialized: {:x?}", writer.inner()); + let mut prot = ThriftCompactInputProtocol::new(writer.inner()); let read_val = T::try_from(&mut prot).unwrap(); assert_eq!(val, read_val); From 272a013dd263c2b31555743f1fadad9b4221ccae Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 15:05:12 -0700 Subject: [PATCH 024/126] get rid of copied allow --- parquet/src/parquet_thrift.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 80427ddf1359..c8ff863c15a4 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -804,7 +804,6 @@ where } #[cfg(test)] -#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module pub(crate) mod tests { use crate::basic::{TimeUnit, Type}; From 632e17127bfa747a064ad9c7900287f4a8f56874 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 17:12:19 -0700 Subject: [PATCH 025/126] get writer macros for structs working --- parquet/src/basic.rs | 334 +++++--------------- parquet/src/file/column_crypto_metadata.rs | 37 ++- parquet/src/file/metadata/mod.rs | 6 +- parquet/src/file/metadata/thrift_gen.rs | 6 +- parquet/src/file/page_encoding_stats.rs | 7 +- parquet/src/file/page_index/index_reader.rs | 6 +- parquet/src/file/page_index/offset_index.rs | 8 +- parquet/src/parquet_macros.rs | 86 ++++- parquet/src/parquet_thrift.rs | 82 +++++ parquet/tests/arrow_reader/io/mod.rs | 6 +- 10 files changed, 305 insertions(+), 273 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 50b920401646..3d774861c2a8 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -206,6 +206,19 @@ impl WriteThrift for ConvertedType { } } +impl WriteThriftField for ConvertedType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift union `crate::format::TimeUnit` @@ -230,33 +243,6 @@ struct DecimalType { } ); -impl WriteThrift for DecimalType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self.scale.write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .precision - .write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for DecimalType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} 
- thrift_struct!( struct TimestampType { 1: required bool is_adjusted_to_u_t_c @@ -264,33 +250,6 @@ struct TimestampType { } ); -impl WriteThrift for TimestampType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .is_adjusted_to_u_t_c - .write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self.unit.write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for TimestampType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - // they are identical use TimestampType as TimeType; @@ -301,35 +260,6 @@ struct IntType { } ); -impl WriteThrift for IntType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .bit_width - .write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .is_signed - .write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for IntType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct VariantType { // The version of the variant specification that the variant was @@ -338,70 +268,12 @@ struct VariantType { } ); -impl WriteThrift for VariantType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.specification_version.is_some() { - last_field_id = - self.specification_version - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl WriteThriftField for VariantType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct GeometryType<'a> { 1: optional string<'a> crs; } ); -impl<'a, W: Write> WriteThrift for GeometryType<'a> { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.crs.is_some() { - last_field_id = self - .crs - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl<'a, W: Write> WriteThriftField for GeometryType<'a> { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct GeographyType<'a> { 1: optional string<'a> crs; @@ -409,42 +281,6 @@ struct GeographyType<'a> { } ); 
-impl<'a, W: Write> WriteThrift for GeographyType<'a> { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.crs.is_some() { - last_field_id = self - .crs - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - if self.algorithm.is_some() { - last_field_id = - self.algorithm - .as_ref() - .unwrap() - .write_thrift_field(writer, 2, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl<'a, W: Write> WriteThriftField for GeographyType<'a> { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - /// Logical types used by version 2.4.0+ of the Parquet format. /// /// This is an *entirely new* struct as of version @@ -646,20 +482,16 @@ impl WriteThrift for LogicalType { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { - writer.write_field_begin(FieldType::Struct, 1, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(1, 0)?; } Self::Map => { - writer.write_field_begin(FieldType::Struct, 2, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(2, 0)?; } Self::List => { - writer.write_field_begin(FieldType::Struct, 3, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(3, 0)?; } Self::Enum => { - writer.write_field_begin(FieldType::Struct, 4, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(4, 0)?; } Self::Decimal { scale, precision } => { DecimalType { @@ -669,8 +501,7 @@ impl WriteThrift for LogicalType { .write_thrift_field(writer, 5, 0)?; } Self::Date => { - writer.write_field_begin(FieldType::Struct, 6, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(6, 0)?; } Self::Time { is_adjusted_to_u_t_c, @@ -703,24 +534,19 @@ impl WriteThrift for LogicalType { .write_thrift_field(writer, 10, 0)?; } Self::Unknown => { - writer.write_field_begin(FieldType::Struct, 11, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(11, 0)?; } Self::Json => { - writer.write_field_begin(FieldType::Struct, 12, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(12, 0)?; } Self::Bson => { - writer.write_field_begin(FieldType::Struct, 13, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(13, 0)?; } Self::Uuid => { - writer.write_field_begin(FieldType::Struct, 14, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(14, 0)?; } Self::Float16 => { - writer.write_field_begin(FieldType::Struct, 15, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(15, 0)?; } Self::Variant { specification_version, @@ -749,35 +575,37 @@ impl WriteThrift for LogicalType { } } +impl WriteThriftField for LogicalType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::FieldRepetitionType` // // Cannot use macro since the name is changed +thrift_enum!( /// Representation of field types in schema. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-#[allow(non_camel_case_types)]
-pub enum Repetition {
-    /// Field is required (can not be null) and each record has exactly 1 value.
-    REQUIRED,
-    /// Field is optional (can be null) and each record has 0 or 1 values.
-    OPTIONAL,
-    /// Field is repeated and can contain 0 or more values.
-    REPEATED,
+enum FieldRepetitionType {
+    /// This field is required (can not be null) and each row has exactly 1 value.
+    REQUIRED = 0;
+    /// The field is optional (can be null) and each row has 0 or 1 values.
+    OPTIONAL = 1;
+    /// The field is repeated and can contain 0 or more values.
+    REPEATED = 2;
 }
+);
 
-impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Repetition {
-    type Error = ParquetError;
-    fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self> {
-        let val = prot.read_i32()?;
-        Ok(match val {
-            0 => Self::REQUIRED,
-            1 => Self::OPTIONAL,
-            2 => Self::REPEATED,
-            _ => return Err(general_err!("Unexpected FieldRepetitionType {}", val)),
-        })
-    }
-}
+pub(crate) type Repetition = FieldRepetitionType;
 
 // ----------------------------------------------------------------------
 // Mirrors thrift enum `crate::format::Encoding`
@@ -945,6 +773,39 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression {
     }
 }
 
+// FIXME: the GZIP/BROTLI/ZSTD variants carry a compression level on the Rust
+// side, but the thrift CompressionCodec is a bare enum, so only the codec id
+// can be written here and the level is dropped.
+impl WriteThrift for Compression {
+    const ELEMENT_TYPE: ElementType = ElementType::I32;
+
+    fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+        let id: i32 = match *self {
+            Self::UNCOMPRESSED => 0,
+            Self::SNAPPY => 1,
+            Self::GZIP(_) => 2,
+            Self::LZO => 3,
+            Self::BROTLI(_) => 4,
+            Self::LZ4 => 5,
+            Self::ZSTD(_) => 6,
+            Self::LZ4_RAW => 7,
+        };
+        writer.write_i32(id)
+    }
+}
+
+impl WriteThriftField for Compression {
+    fn write_thrift_field(
+        &self,
+        writer: &mut ThriftCompactOutputProtocol,
+        field_id: i16,
+        last_field_id: i16,
+    ) -> Result<i16> {
+        writer.write_field_begin(FieldType::I32, field_id, last_field_id)?;
+        self.write_thrift(writer)?;
+        Ok(field_id)
+    }
+}
+
 impl Compression {
     /// Returns the codec type of this compression setting as a string, without the compression
     /// level.
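A minimal sketch of what the FIXME above means in practice (`ZstdLevel::try_new`
is this crate's existing constructor; everything else here is illustrative):

    let written = Compression::ZSTD(ZstdLevel::try_new(10).unwrap());
    // the impl above serializes this as the bare codec id 6; decoding id 6
    // can only reconstruct ZSTD with a default level, so the 10 is lost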
@@ -1317,12 +1178,6 @@ impl fmt::Display for ConvertedType { } } -impl fmt::Display for Repetition { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{self:?}") - } -} - impl fmt::Display for Compression { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{self:?}") @@ -1574,37 +1429,6 @@ impl From> for ConvertedType { } } -// ---------------------------------------------------------------------- -// crate::format::FieldRepetitionType <=> Repetition conversion - -impl TryFrom for Repetition { - type Error = ParquetError; - - fn try_from(value: crate::format::FieldRepetitionType) -> Result { - Ok(match value { - crate::format::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, - crate::format::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, - crate::format::FieldRepetitionType::REPEATED => Repetition::REPEATED, - _ => { - return Err(general_err!( - "unexpected parquet repetition type: {}", - value.0 - )) - } - }) - } -} - -impl From for crate::format::FieldRepetitionType { - fn from(value: Repetition) -> Self { - match value { - Repetition::REQUIRED => crate::format::FieldRepetitionType::REQUIRED, - Repetition::OPTIONAL => crate::format::FieldRepetitionType::OPTIONAL, - Repetition::REPEATED => crate::format::FieldRepetitionType::REPEATED, - } - } -} - // ---------------------------------------------------------------------- // crate::format::CompressionCodec <=> Compression conversion diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 95cbc65cf716..ec2e91beaa54 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -17,13 +17,18 @@ //! Column chunk encryption metadata +use std::io::Write; + use crate::errors::{ParquetError, Result}; use crate::format::{ ColumnCryptoMetaData as TColumnCryptoMetaData, EncryptionWithColumnKey as TEncryptionWithColumnKey, EncryptionWithFooterKey as TEncryptionWithFooterKey, }; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::{thrift_struct, thrift_union}; // define this and ColumnCryptoMetadata here so they're only defined when @@ -48,6 +53,36 @@ union ColumnCryptoMetaData { } ); +// TODO: need to get this into the thrift_union macro +impl WriteThrift for ColumnCryptoMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match self { + Self::ENCRYPTION_WITH_FOOTER_KEY => { + writer.write_empty_struct(1, 0)?; + } + Self::ENCRYPTION_WITH_COLUMN_KEY(key) => { + key.write_thrift_field(writer, 2, 0)?; + } + } + writer.write_struct_end() + } +} + +impl WriteThriftField for ColumnCryptoMetaData { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + /// Converts Thrift definition into `ColumnCryptoMetadata`. 
pub fn try_from_thrift( thrift_column_crypto_metadata: &TColumnCryptoMetaData, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 69cdf8f10714..8b06fe676308 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -125,7 +125,10 @@ use crate::{ }; use crate::{ basic::{ColumnOrder, Compression, Encoding, Type}, - parquet_thrift::{FieldType, ThriftCompactInputProtocol}, + parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, + }, }; use crate::{ data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData, @@ -135,6 +138,7 @@ use crate::{ thrift_struct, }; pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader}; +use std::io::Write; use std::ops::Range; use std::sync::Arc; pub use writer::ParquetMetaDataWriter; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 3888d247df1c..bc3914112d0d 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -17,6 +17,7 @@ // a collection of generated structs used to parse thrift metadata +use std::io::Write; use std::sync::Arc; #[cfg(feature = "encryption")] @@ -33,7 +34,10 @@ use crate::{ page_encoding_stats::PageEncodingStats, statistics::ValueStatistics, }, - parquet_thrift::{FieldType, ThriftCompactInputProtocol}, + parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, + }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, thrift_struct, util::bit_util::FromBytes, diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 281954d939dd..2d433dc9b3f1 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -17,9 +17,14 @@ //! Per-page encoding information. 
+use std::io::Write; + use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::thrift_struct; // TODO: This should probably all be moved to thrift_gen diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 99e5963b290e..e9cf119224c9 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -26,8 +26,12 @@ use crate::file::page_index::column_index::{ }; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::thrift_struct; +use std::io::Write; use std::ops::Range; /// Computes the covering range of two optional ranges diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 8217fa7878c8..b2842a897ebf 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -44,7 +44,7 @@ pub struct PageLocation { } ); -impl WriteThrift for PageLocation { +/*impl WriteThrift for PageLocation { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] @@ -59,7 +59,7 @@ impl WriteThrift for PageLocation { .write_thrift_field(writer, 3, last_field_id)?; writer.write_struct_end() } -} +}*/ impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { @@ -95,7 +95,7 @@ pub struct OffsetIndexMetaData { } ); -impl WriteThrift for OffsetIndexMetaData { +/*impl WriteThrift for OffsetIndexMetaData { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] @@ -116,7 +116,7 @@ impl WriteThrift for OffsetIndexMetaData { } writer.write_struct_end() } -} +}*/ impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 40aadad98fb1..c015a8685651 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -140,10 +140,8 @@ macro_rules! thrift_union_all_empty { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { - $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)* - } - // write end of struct for empty struct - writer.write_struct_end()?; + $(Self::$field_name => writer.write_empty_struct($field_id, 0)?,)* + }; // write end of struct for this union writer.write_struct_end() } @@ -266,9 +264,89 @@ macro_rules! thrift_struct { }) } } + + impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* + Ok(()) + } + } + + impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? 
{ + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } + } } } +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_required_or_optional_field { + (required $field_name:ident, $field_id:literal, $field_type:ident, $self:tt, $writer:tt, $last_id:tt) => { + $crate::__thrift_write_required_field!( + $field_type, + $field_name, + $field_id, + $self, + $writer, + $last_id + ) + }; + (optional $field_name:ident, $field_id:literal, $field_type:ident, $self:tt, $writer:tt, $last_id:tt) => { + $crate::__thrift_write_optional_field!( + $field_type, + $field_name, + $field_id, + $self, + $writer, + $last_id + ) + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_required_field { + (binary, $field_name:ident, $field_id:literal, $self:ident, $writer:ident, $last_id:ident) => { + $writer.write_field_begin(FieldType::Binary, $field_id, $last_id)?; + $writer.write_bytes($self.$field_name)?; + $last_id = $field_id; + }; + ($field_type:ident, $field_name:ident, $field_id:literal, $self:ident, $writer:ident, $last_id:ident) => { + $last_id = $self + .$field_name + .write_thrift_field($writer, $field_id, $last_id)?; + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_optional_field { + (binary, $field_name:ident, $field_id:literal, $self:ident, $writer:tt, $last_id:tt) => { + if $self.$field_name.is_some() { + $writer.write_field_begin(FieldType::Binary, $field_id, $last_id)?; + $writer.write_bytes($self.$field_name.as_ref().unwrap())?; + $last_id = $field_id; + } + }; + ($field_type:ident, $field_name:ident, $field_id:literal, $self:ident, $writer:tt, $last_id:tt) => { + if $self.$field_name.is_some() { + $last_id = $self + .$field_name + .as_ref() + .unwrap() + .write_thrift_field($writer, $field_id, $last_id)?; + } + }; +} + /// macro to use when decoding struct fields #[macro_export] macro_rules! 
thrift_read_field { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index c8ff863c15a4..b2cb7bf54597 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -609,6 +609,12 @@ impl ThriftCompactOutputProtocol { Ok(()) } + pub(crate) fn write_empty_struct(&mut self, field_id: i16, last_field_id: i16) -> Result { + self.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_struct_end()?; + Ok(last_field_id) + } + pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> { match val { true => self.write_byte(1), @@ -631,6 +637,11 @@ impl ThriftCompactOutputProtocol { pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> { self.write_zig_zag(val as _) } + + pub(crate) fn write_double(&mut self, val: f64) -> Result<()> { + self.writer.write_all(&val.to_le_bytes())?; + Ok(()) + } } pub(crate) trait WriteThrift { @@ -695,6 +706,38 @@ impl WriteThrift for i64 { } } +impl WriteThrift for OrderedF64 { + const ELEMENT_TYPE: ElementType = ElementType::Double; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_double(self.0) + } +} + +impl WriteThrift for &[u8] { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self) + } +} + +impl WriteThrift for &str { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self.as_bytes()) + } +} + +impl WriteThrift for String { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self.as_bytes()) + } +} + pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. 
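For reference, a minimal sketch of the wire bytes produced by write_empty_struct
above (0xC is the compact-protocol Struct type id used elsewhere in this file):
write_empty_struct(1, 0) emits one short-form header byte for field 1 plus the
Stop byte closing the empty struct, and an empty union variant such as
LogicalType::String adds a second Stop byte closing the union itself.

    let expected: [u8; 3] = [0x1c, 0x00, 0x00]; // header, inner Stop, union Stop

Relatedly, write_bool reuses the BooleanTrue (1) / BooleanFalse (2) type ids as
the payload byte for standalone and list booleans; only booleans in struct
fields fold the value into the field header instead.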
@@ -774,6 +817,32 @@ impl WriteThriftField for i64 { } } +impl WriteThriftField for OrderedF64 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Double, field_id, last_field_id)?; + writer.write_double(self.0)?; + Ok(field_id) + } +} + +impl WriteThriftField for &[u8] { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self)?; + Ok(field_id) + } +} + impl WriteThriftField for &str { fn write_thrift_field( &self, @@ -787,6 +856,19 @@ impl WriteThriftField for &str { } } +impl WriteThriftField for String { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self.as_bytes())?; + Ok(field_id) + } +} + impl WriteThriftField for Vec where T: WriteThrift, diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index bfdb9467e20c..051a61de5075 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -286,11 +286,7 @@ impl TestRowGroups { .enumerate() .map(|(col_idx, col_meta)| { let column_name = col_meta.column_descr().name().to_string(); - let page_locations = offset_index[rg_index][col_idx] - .page_locations() - .iter() - .map(parquet::format::PageLocation::from) - .collect(); + let page_locations = offset_index[rg_index][col_idx].page_locations(); let dictionary_page_location = col_meta.dictionary_page_offset(); // We can find the byte range of the entire column chunk From 9f01b6076782a8f98f532291d10c1d3a7e0fcbd1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 17:21:17 -0700 Subject: [PATCH 026/126] fix bug in struct macro --- parquet/src/file/page_index/offset_index.rs | 40 --------------------- parquet/src/parquet_macros.rs | 2 +- 2 files changed, 1 insertion(+), 41 deletions(-) diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index b2842a897ebf..ac2620af09d8 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -44,23 +44,6 @@ pub struct PageLocation { } ); -/*impl WriteThrift for PageLocation { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self.offset.write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .compressed_page_size - .write_thrift_field(writer, 2, last_field_id)?; - last_field_id = self - .first_row_index - .write_thrift_field(writer, 3, last_field_id)?; - writer.write_struct_end() - } -}*/ - impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { Self { @@ -95,29 +78,6 @@ pub struct OffsetIndexMetaData { } ); -/*impl WriteThrift for OffsetIndexMetaData { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift( - &self, - writer: &mut crate::parquet_thrift::ThriftCompactOutputProtocol, - ) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .page_locations - .write_thrift_field(writer, 1, 
last_field_id)?; - if self.unencoded_byte_array_data_bytes.is_some() { - last_field_id = self - .unencoded_byte_array_data_bytes - .as_ref() - .unwrap() - .write_thrift_field(writer, 2, last_field_id)?; - } - writer.write_struct_end() - } -}*/ - impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. /// diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index c015a8685651..5573c5812946 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -272,7 +272,7 @@ macro_rules! thrift_struct { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* - Ok(()) + writer.write_struct_end() } } From 2511f8fe0f8a4d598fe2ec5f519aa31447b8859d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 18:19:40 -0700 Subject: [PATCH 027/126] make Repetition public --- parquet/src/basic.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 3d774861c2a8..cf451b961f69 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -605,7 +605,8 @@ enum FieldRepetitionType { } ); -pub(crate) type Repetition = FieldRepetitionType; +/// Type alias for thrift `FieldRepetitionType` +pub type Repetition = FieldRepetitionType; // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::Encoding` From 61e9e07655e16cd34e39ab3226b37fbaa2c61e10 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:32:36 -0700 Subject: [PATCH 028/126] get union working for writes --- parquet/src/file/column_crypto_metadata.rs | 51 +++++++++------------- parquet/src/parquet_macros.rs | 50 +++++++++++++++++---- 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index ec2e91beaa54..5bba07357947 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -53,36 +53,6 @@ union ColumnCryptoMetaData { } ); -// TODO: need to get this into the thrift_union macro -impl WriteThrift for ColumnCryptoMetaData { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - match self { - Self::ENCRYPTION_WITH_FOOTER_KEY => { - writer.write_empty_struct(1, 0)?; - } - Self::ENCRYPTION_WITH_COLUMN_KEY(key) => { - key.write_thrift_field(writer, 2, 0)?; - } - } - writer.write_struct_end() - } -} - -impl WriteThriftField for ColumnCryptoMetaData { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - /// Converts Thrift definition into `ColumnCryptoMetadata`. 
pub fn try_from_thrift( thrift_column_crypto_metadata: &TColumnCryptoMetaData, @@ -119,6 +89,7 @@ pub fn to_thrift(column_crypto_metadata: &ColumnCryptoMetaData) -> TColumnCrypto #[cfg(test)] mod tests { use super::*; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_encryption_with_footer_key_from_thrift() { @@ -136,4 +107,24 @@ mod tests { assert_eq!(try_from_thrift(&to_thrift(&metadata)).unwrap(), metadata); } + + #[test] + fn test_column_crypto_roundtrip() { + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY); + + let path_in_schema = vec!["foo".to_owned(), "bar".to_owned(), "really".to_owned()]; + let key_metadata = vec![1u8; 32]; + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY( + EncryptionWithColumnKey { + path_in_schema: path_in_schema.clone(), + key_metadata: None, + }, + )); + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY( + EncryptionWithColumnKey { + path_in_schema, + key_metadata: Some(key_metadata), + }, + )); + } } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 5573c5812946..41a5bf3b43f9 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -219,9 +219,51 @@ macro_rules! thrift_union { Ok(ret) } } + + impl WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match self { + $($crate::__thrift_write_variant_lhs!($field_name $($field_type)?, variant_val) => + $crate::__thrift_write_variant_rhs!($field_id $($field_type)?, writer, variant_val),)* + }; + writer.write_struct_end() + } + } + + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } + } } } +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_lhs { + ($field_name:ident $field_type:ident, $val:tt) => { + Self::$field_name($val) + }; + ($field_name:ident, $val:tt) => { + Self::$field_name + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_rhs { + ($field_id:literal $field_type:ident, $writer:tt, $val:ident) => { + $val.write_thrift_field($writer, $field_id, 0)? + }; + ($field_id:literal, $writer:tt, $val:tt) => { + $writer.write_empty_struct($field_id, 0)? + }; +} + /// macro to generate rust structs from a thrift struct definition /// unlike enum and union, this macro will allow for visibility specifier /// can also take optional lifetime for struct and elements within it (need e.g.) @@ -347,14 +389,6 @@ macro_rules! __thrift_write_optional_field { }; } -/// macro to use when decoding struct fields -#[macro_export] -macro_rules! thrift_read_field { - ($field_name:ident, $prot:tt, $field_type:ident) => { - $field_name = Some($crate::__thrift_read_field!($prot, $field_type)); - }; -} - #[doc(hidden)] #[macro_export] macro_rules! 
__thrift_required_or_optional { From e39f119d30fca1ec7b87acfc56a1aa5c4ef41626 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:49:34 -0700 Subject: [PATCH 029/126] add some tests --- parquet/src/file/metadata/thrift_gen.rs | 42 +++++++++++++++++++++++++ parquet/src/parquet_thrift.rs | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index bc3914112d0d..161f792084f7 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -511,3 +511,45 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { Ok(ParquetMetaData::new(fmd, row_groups)) } } + +#[cfg(test)] +mod tests { + use crate::file::metadata::thrift_gen::BoundingBox; + use crate::parquet_thrift::{tests::test_roundtrip, OrderedF64}; + + #[test] + fn test_bounding_box_roundtrip() { + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: None, + zmax: None, + mmin: None, + mmax: None, + }); + + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: Some(OrderedF64(11.0)), + zmax: Some(OrderedF64(1300.0)), + mmin: None, + mmax: None, + }); + + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: Some(OrderedF64(11.0)), + zmax: Some(OrderedF64(1300.0)), + mmin: Some(OrderedF64(3.14)), + mmax: Some(OrderedF64(42.0)), + }); + } +} diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index b2cb7bf54597..935965b64abd 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -29,7 +29,7 @@ use crate::errors::{ParquetError, Result}; // wrappers out there that should probably be used instead. 
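// (for context on why a wrapper type is wanted at all: bare f64 is only
// PartialEq, since NaN != NaN, and has no total order, so a newtype is the
// usual way to give the metadata structs derivable equality; ordered-float's
// OrderedFloat is the crate-provided version of the same idea)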
// thrift seems to re-export an impl from ordered-float #[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrderedF64(f64); +pub struct OrderedF64(pub f64); impl From for f64 { fn from(value: OrderedF64) -> Self { From def3d07fa516bfadbf5d7bd35ffa6a4c0b427994 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:55:55 -0700 Subject: [PATCH 030/126] redo OrderedF64 initialization --- parquet/src/file/metadata/thrift_gen.rs | 36 ++++++++++++------------- parquet/src/parquet_thrift.rs | 8 +++++- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 161f792084f7..c553d8f5f572 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -520,10 +520,10 @@ mod tests { #[test] fn test_bounding_box_roundtrip() { test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), zmin: None, zmax: None, mmin: None, @@ -531,25 +531,25 @@ mod tests { }); test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), - zmin: Some(OrderedF64(11.0)), - zmax: Some(OrderedF64(1300.0)), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), + zmin: Some(11.0.into()), + zmax: Some(1300.0.into()), mmin: None, mmax: None, }); test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), - zmin: Some(OrderedF64(11.0)), - zmax: Some(OrderedF64(1300.0)), - mmin: Some(OrderedF64(3.14)), - mmax: Some(OrderedF64(42.0)), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), + zmin: Some(11.0.into()), + zmax: Some(1300.0.into()), + mmin: Some(3.14.into()), + mmax: Some(42.0.into()), }); } } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 935965b64abd..4f04d990860e 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -29,7 +29,13 @@ use crate::errors::{ParquetError, Result}; // wrappers out there that should probably be used instead. 
// thrift seems to re-export an impl from ordered-float #[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrderedF64(pub f64); +pub struct OrderedF64(f64); + +impl From for OrderedF64 { + fn from(value: f64) -> Self { + Self(value) + } +} impl From for f64 { fn from(value: OrderedF64) -> Self { From 386f222f79fdee635c1f696942fca6c969d2365b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:57:09 -0700 Subject: [PATCH 031/126] unused import --- parquet/src/file/metadata/thrift_gen.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index c553d8f5f572..1dc829e5cfe2 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -515,7 +515,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { #[cfg(test)] mod tests { use crate::file::metadata::thrift_gen::BoundingBox; - use crate::parquet_thrift::{tests::test_roundtrip, OrderedF64}; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_bounding_box_roundtrip() { From 6beb79d4fbe11fd570c4144c5bd592f434c7785d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 12:12:21 -0700 Subject: [PATCH 032/126] get decryption working --- parquet/src/file/metadata/mod.rs | 94 +--------- parquet/src/file/metadata/reader.rs | 217 +++------------------- parquet/src/file/metadata/thrift_gen.rs | 234 +++++++++++++++++++++++- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/parquet_macros.rs | 15 +- parquet/tests/arrow_reader/bad_data.rs | 2 +- 6 files changed, 268 insertions(+), 296 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 8b06fe676308..0c4372e38683 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -97,10 +97,7 @@ pub(crate) mod thrift_gen; mod writer; #[cfg(feature = "encryption")] -use crate::encryption::{ - decrypt::FileDecryptor, - modules::{create_module_aad, ModuleType}, -}; +use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; @@ -117,8 +114,6 @@ use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; -#[cfg(feature = "encryption")] -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use crate::{ basic::BoundaryOrder, errors::{ParquetError, Result}, @@ -684,93 +679,6 @@ impl RowGroupMetaData { self.file_offset } - /// Method to convert from encrypted Thrift. - #[cfg(feature = "encryption")] - fn from_encrypted_thrift( - schema_descr: SchemaDescPtr, - mut rg: crate::format::RowGroup, - decryptor: Option<&FileDecryptor>, - ) -> Result { - if schema_descr.num_columns() != rg.columns.len() { - return Err(general_err!( - "Column count mismatch. Schema has {} columns while Row Group has {}", - schema_descr.num_columns(), - rg.columns.len() - )); - } - let total_byte_size = rg.total_byte_size; - let num_rows = rg.num_rows; - let mut columns = vec![]; - - for (i, (mut c, d)) in rg - .columns - .drain(0..) - .zip(schema_descr.columns()) - .enumerate() - { - // Read encrypted metadata if it's present and we have a decryptor. 
-            if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) {
-                let column_decryptor = match c.crypto_metadata.as_ref() {
-                    None => {
-                        return Err(general_err!(
-                            "No crypto_metadata is set for column '{}', which has encrypted metadata",
-                            d.path().string()
-                        ));
-                    }
-                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => {
-                        let column_name = crypto_metadata.path_in_schema.join(".");
-                        decryptor.get_column_metadata_decryptor(
-                            column_name.as_str(),
-                            crypto_metadata.key_metadata.as_deref(),
-                        )?
-                    }
-                    Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => {
-                        decryptor.get_footer_decryptor()?
-                    }
-                };
-
-                let column_aad = create_module_aad(
-                    decryptor.file_aad(),
-                    ModuleType::ColumnMetaData,
-                    rg.ordinal.unwrap() as usize,
-                    i,
-                    None,
-                )?;
-
-                let buf = c.encrypted_column_metadata.clone().unwrap();
-                let decrypted_cc_buf = column_decryptor
-                    .decrypt(buf.as_slice(), column_aad.as_ref())
-                    .map_err(|_| {
-                        general_err!(
-                            "Unable to decrypt column '{}', perhaps the column key is wrong?",
-                            d.path().string()
-                        )
-                    })?;
-
-                let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice());
-                c.meta_data = Some(crate::format::ColumnMetaData::read_from_in_protocol(
-                    &mut prot,
-                )?);
-            }
-            columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?);
-        }
-
-        let sorting_columns = rg.sorting_columns.map(|scs| {
-            scs.iter()
-                .map(|sc| sc.into())
-                .collect::<Vec<SortingColumn>>()
-        });
-        Ok(RowGroupMetaData {
-            columns,
-            num_rows,
-            sorting_columns,
-            total_byte_size,
-            schema_descr,
-            file_offset: rg.file_offset,
-            ordinal: rg.ordinal,
-        })
-    }
-
     /// Method to convert from Thrift.
     pub fn from_thrift(

diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs
index 57cc7c57ac66..ddccf39703bc 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -15,32 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
-use std::{io::Read, ops::Range, sync::Arc}; +use std::{io::Read, ops::Range}; -use crate::{ - basic::ColumnOrder, - file::metadata::{FileMetaData, KeyValue}, - parquet_thrift::ThriftCompactInputProtocol, -}; +use crate::parquet_thrift::ThriftCompactInputProtocol; #[cfg(feature = "encryption")] use crate::{ encryption::{ decrypt::{CryptoContext, FileDecryptionProperties, FileDecryptor}, modules::create_footer_aad, }, - format::{EncryptionAlgorithm, FileCryptoMetaData as TFileCryptoMetaData}, + file::metadata::thrift_gen::EncryptionAlgorithm, }; use bytes::Bytes; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; +use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData}; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; -use crate::schema::types; -use crate::schema::types::SchemaDescriptor; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; #[cfg(all(feature = "async", feature = "arrow"))] use crate::arrow::async_reader::{MetadataFetch, MetadataSuffixFetch}; @@ -960,17 +953,21 @@ impl ParquetMetaDataReader { encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - let mut prot = TCompactSliceInputProtocol::new(buf); + use crate::file::metadata::thrift_gen::parquet_metadata_with_encryption; + + let mut prot = ThriftCompactInputProtocol::new(buf); let mut file_decryptor = None; let decrypted_fmd_buf; if encrypted_footer { if let Some(file_decryption_properties) = file_decryption_properties { - let t_file_crypto_metadata: TFileCryptoMetaData = - TFileCryptoMetaData::read_from_in_protocol(&mut prot) + use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileCryptoMetaData}; + + let t_file_crypto_metadata: FileCryptoMetaData = + FileCryptoMetaData::try_from(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { - EncryptionAlgorithm::AESGCMV1(algo) => algo.supply_aad_prefix, + EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, _ => Some(false), } .unwrap_or(false); @@ -995,7 +992,7 @@ impl ParquetMetaDataReader { "Provided footer key and AAD were unable to decrypt parquet footer" ) })?; - prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); + prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); file_decryptor = Some(decryptor); } else { @@ -1003,58 +1000,13 @@ impl ParquetMetaDataReader { } } - let t_file_metadata = crate::format::FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse metadata: {}", e))?; - let schema = types::from_thrift(&t_file_metadata.schema)?; - let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - - if let (Some(algo), Some(file_decryption_properties)) = ( - t_file_metadata.encryption_algorithm, + parquet_metadata_with_encryption( + &mut prot, + file_decryptor, file_decryption_properties, - ) { - // File has a plaintext footer but encryption algorithm is set - let file_decryptor_value = get_file_decryptor( - algo, - t_file_metadata.footer_signing_key_metadata.as_deref(), - file_decryption_properties, - )?; - if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { - 
file_decryptor_value.verify_plaintext_footer_signature(buf)?; - } - file_decryptor = Some(file_decryptor_value); - } - - let mut row_groups = Vec::new(); - for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_encrypted_thrift( - schema_descr.clone(), - rg, - file_decryptor.as_ref(), - )?; - row_groups.push(r); - } - let column_orders = - Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; - - let key_value_metadata = t_file_metadata.key_value_metadata.map(|vkv| { - vkv.into_iter() - .map(|kv| KeyValue::new(kv.key, kv.value)) - .collect::>() - }); - - let file_metadata = FileMetaData::new( - t_file_metadata.version, - t_file_metadata.num_rows, - t_file_metadata.created_by, - key_value_metadata, - schema_descr, - column_orders, - ); - let mut metadata = ParquetMetaData::new(file_metadata, row_groups); - - metadata.with_file_decryptor(file_decryptor); - - Ok(metadata) + encrypted_footer, + buf, + ) } /// Decodes [`ParquetMetaData`] from the provided bytes. @@ -1065,36 +1017,8 @@ impl ParquetMetaDataReader { /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata(buf: &[u8]) -> Result { - let mut prot = TCompactSliceInputProtocol::new(buf); - - let t_file_metadata = crate::format::FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse metadata: {}", e))?; - let schema = types::from_thrift(&t_file_metadata.schema)?; - let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - - let mut row_groups = Vec::new(); - for rg in t_file_metadata.row_groups { - row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); - } - let column_orders = - Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; - - let key_value_metadata = t_file_metadata.key_value_metadata.map(|vkv| { - vkv.into_iter() - .map(|kv| KeyValue::new(kv.key, kv.value)) - .collect::>() - }); - - let file_metadata = FileMetaData::new( - t_file_metadata.version, - t_file_metadata.num_rows, - t_file_metadata.created_by, - key_value_metadata, - schema_descr, - column_orders, - ); - - Ok(ParquetMetaData::new(file_metadata, row_groups)) + let mut prot = ThriftCompactInputProtocol::new(buf); + ParquetMetaData::try_from(&mut prot) } /// create meta data from thrift encoded bytes @@ -1102,55 +1026,25 @@ impl ParquetMetaDataReader { let mut prot = ThriftCompactInputProtocol::new(buf); ParquetMetaData::try_from(&mut prot) } - - /// Parses column orders from Thrift definition. - /// If no column orders are defined, returns `None`. 
- fn parse_column_orders( - t_column_orders: Option>, - schema_descr: &SchemaDescriptor, - ) -> Result>> { - match t_column_orders { - Some(orders) => { - // Should always be the case - if orders.len() != schema_descr.num_columns() { - return Err(general_err!("Column order length mismatch")); - }; - let mut res = Vec::new(); - for (i, column) in schema_descr.columns().iter().enumerate() { - match orders[i] { - crate::format::ColumnOrder::TYPEORDER(_) => { - let sort_order = ColumnOrder::get_sort_order( - column.logical_type(), - column.converted_type(), - column.physical_type(), - ); - res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); - } - } - } - Ok(Some(res)) - } - None => Ok(None), - } - } } #[cfg(feature = "encryption")] -fn get_file_decryptor( +pub(super) fn get_file_decryptor( encryption_algorithm: EncryptionAlgorithm, footer_key_metadata: Option<&[u8]>, file_decryption_properties: &FileDecryptionProperties, ) -> Result { match encryption_algorithm { - EncryptionAlgorithm::AESGCMV1(algo) => { + EncryptionAlgorithm::AES_GCM_V1(algo) => { let aad_file_unique = algo .aad_file_unique .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { aad_prefix.clone() } else { - algo.aad_prefix.unwrap_or_default() + algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() }; + let aad_file_unique = aad_file_unique.to_vec(); FileDecryptor::new( file_decryption_properties, @@ -1159,7 +1053,7 @@ fn get_file_decryptor( aad_prefix, ) } - EncryptionAlgorithm::AESGCMCTRV1(_) => Err(nyi_err!( + EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" )), } @@ -1171,10 +1065,7 @@ mod tests { use bytes::Bytes; use zstd::zstd_safe::WriteBuf; - use crate::basic::SortOrder; - use crate::basic::Type; use crate::file::reader::Length; - use crate::schema::types::Type as SchemaType; use crate::util::test_common::file_util::get_test_file; #[test] @@ -1205,61 +1096,6 @@ mod tests { assert!(matches!(err, ParquetError::NeedMoreData(263))); } - #[test] - fn test_metadata_column_orders_parse() { - // Define simple schema, we do not need to provide logical types. - let fields = vec![ - Arc::new( - SchemaType::primitive_type_builder("col1", Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("col2", Type::FLOAT) - .build() - .unwrap(), - ), - ]; - let schema = SchemaType::group_type_builder("schema") - .with_fields(fields) - .build() - .unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![ - crate::format::ColumnOrder::TYPEORDER(Default::default()), - crate::format::ColumnOrder::TYPEORDER(Default::default()), - ]); - - assert_eq!( - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr).unwrap(), - Some(vec![ - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) - ]) - ); - - // Test when no column orders are defined. 
- assert_eq!( - ParquetMetaDataReader::parse_column_orders(None, &schema_descr).unwrap(), - None - ); - } - - #[test] - fn test_metadata_column_orders_len_mismatch() { - let schema = SchemaType::group_type_builder("schema").build().unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![crate::format::ColumnOrder::TYPEORDER( - Default::default(), - )]); - - let res = ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); - assert!(res.is_err()); - assert!(format!("{:?}", res.unwrap_err()).contains("Column order length mismatch")); - } - #[test] #[allow(deprecated)] fn test_try_parse() { @@ -1412,6 +1248,7 @@ mod async_tests { use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; use tempfile::NamedTempFile; use crate::arrow::ArrowWriter; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 1dc829e5cfe2..60e9b5036916 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -20,8 +20,6 @@ use std::io::Write; use std::sync::Arc; -#[cfg(feature = "encryption")] -use crate::file::column_crypto_metadata::ColumnCryptoMetaData; use crate::{ basic::{ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, Type}, data_type::{ByteArray, FixedLenByteArray, Int96}, @@ -39,9 +37,15 @@ use crate::{ WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, - thrift_struct, + thrift_struct, thrift_union, util::bit_util::FromBytes, }; +#[cfg(feature = "encryption")] +use crate::{ + encryption::decrypt::{FileDecryptionProperties, FileDecryptor}, + file::column_crypto_metadata::ColumnCryptoMetaData, + schema::types::SchemaDescPtr, +}; // this needs to be visible to the schema conversion code thrift_struct!( @@ -60,6 +64,56 @@ pub(crate) struct SchemaElement<'a> { } ); +thrift_struct!( +pub(crate) struct AesGcmV1<'a> { + /// AAD prefix + 1: optional binary<'a> aad_prefix + + /// Unique file identifier part of AAD suffix + 2: optional binary<'a> aad_file_unique + + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix + 3: optional bool supply_aad_prefix +} +); + +thrift_struct!( +pub(crate) struct AesGcmCtrV1<'a> { + /// AAD prefix + 1: optional binary<'a> aad_prefix + + /// Unique file identifier part of AAD suffix + 2: optional binary<'a> aad_file_unique + + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix + 3: optional bool supply_aad_prefix +} +); + +thrift_union!( +union EncryptionAlgorithm<'a> { + 1: (AesGcmV1<'a>) AES_GCM_V1 + 2: (AesGcmCtrV1<'a>) AES_GCM_CTR_V1 +} +); + +#[cfg(feature = "encryption")] +thrift_struct!( +/// Crypto metadata for files with encrypted footer +pub(crate) struct FileCryptoMetaData<'a> { + /// Encryption algorithm. This field is only used for files + /// with encrypted footer. Files with plaintext footer store algorithm id + /// inside footer (FileMetaData structure). 
+ 1: required EncryptionAlgorithm<'a> encryption_algorithm + + /** Retrieval metadata of key used for encryption of footer, + * and (possibly) columns **/ + 2: optional binary<'a> key_metadata +} +); + // the following are only used internally so are private thrift_struct!( struct FileMetaData<'a> { @@ -71,8 +125,8 @@ struct FileMetaData<'a> { 5: optional list key_value_metadata 6: optional string created_by 7: optional list column_orders; - //8: optional EncryptionAlgorithm encryption_algorithm - //9: optional binary footer_signing_key_metadata + 8: optional EncryptionAlgorithm<'a> encryption_algorithm + 9: optional binary<'a> footer_signing_key_metadata } ); @@ -453,6 +507,176 @@ fn convert_stats( }) } +#[cfg(feature = "encryption")] +fn row_group_from_encrypted_thrift( + mut rg: RowGroup, + schema_descr: SchemaDescPtr, + decryptor: Option<&FileDecryptor>, +) -> Result { + if schema_descr.num_columns() != rg.columns.len() { + return Err(general_err!( + "Column count mismatch. Schema has {} columns while Row Group has {}", + schema_descr.num_columns(), + rg.columns.len() + )); + } + let total_byte_size = rg.total_byte_size; + let num_rows = rg.num_rows; + let mut columns = vec![]; + + for (i, (mut c, d)) in rg + .columns + .drain(0..) + .zip(schema_descr.columns()) + .enumerate() + { + // Read encrypted metadata if it's present and we have a decryptor. + if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) { + let column_decryptor = match c.crypto_metadata.as_ref() { + None => { + return Err(general_err!( + "No crypto_metadata is set for column '{}', which has encrypted metadata", + d.path().string() + )); + } + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(crypto_metadata)) => { + let column_name = crypto_metadata.path_in_schema.join("."); + decryptor.get_column_metadata_decryptor( + column_name.as_str(), + crypto_metadata.key_metadata.as_deref(), + )? + } + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => { + decryptor.get_footer_decryptor()? 
+ } + }; + + let column_aad = crate::encryption::modules::create_module_aad( + decryptor.file_aad(), + crate::encryption::modules::ModuleType::ColumnMetaData, + rg.ordinal.unwrap() as usize, + i, + None, + )?; + + let buf = c.encrypted_column_metadata.unwrap(); + let decrypted_cc_buf = + column_decryptor + .decrypt(buf, column_aad.as_ref()) + .map_err(|_| { + general_err!( + "Unable to decrypt column '{}', perhaps the column key is wrong?", + d.path().string() + ) + })?; + + let mut prot = ThriftCompactInputProtocol::new(decrypted_cc_buf.as_slice()); + let col_meta = ColumnMetaData::try_from(&mut prot)?; + c.meta_data = Some(col_meta); + columns.push(convert_column(c, d.clone())?); + } else { + columns.push(convert_column(c, d.clone())?); + } + } + + let sorting_columns = rg.sorting_columns; + let file_offset = rg.file_offset; + let ordinal = rg.ordinal; + + Ok(RowGroupMetaData { + columns, + num_rows, + sorting_columns, + total_byte_size, + schema_descr, + file_offset, + ordinal, + }) +} + +#[cfg(feature = "encryption")] +pub(crate) fn parquet_metadata_with_encryption<'a>( + prot: &mut ThriftCompactInputProtocol<'a>, + mut file_decryptor: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, + encrypted_footer: bool, + buf: &[u8], +) -> Result { + let file_meta = super::thrift_gen::FileMetaData::try_from(prot) + .map_err(|e| general_err!("Could not parse metadata: {}", e))?; + + let version = file_meta.version; + let num_rows = file_meta.num_rows; + let created_by = file_meta.created_by.map(|c| c.to_owned()); + let key_value_metadata = file_meta.key_value_metadata; + + let val = parquet_schema_from_array(file_meta.schema)?; + let schema_descr = Arc::new(SchemaDescriptor::new(val)); + + if let (Some(algo), Some(file_decryption_properties)) = + (file_meta.encryption_algorithm, file_decryption_properties) + { + // File has a plaintext footer but encryption algorithm is set + let file_decryptor_value = crate::file::metadata::reader::get_file_decryptor( + algo, + file_meta.footer_signing_key_metadata.as_deref(), + file_decryption_properties, + )?; + if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { + file_decryptor_value.verify_plaintext_footer_signature(buf)?; + } + file_decryptor = Some(file_decryptor_value); + } + + // decrypt column chunk info + let mut row_groups = Vec::with_capacity(file_meta.row_groups.len()); + for rg in file_meta.row_groups { + let r = row_group_from_encrypted_thrift(rg, schema_descr.clone(), file_decryptor.as_ref())?; + row_groups.push(r); + } + + // need to map read column orders to actual values based on the schema + if file_meta + .column_orders + .as_ref() + .is_some_and(|cos| cos.len() != schema_descr.num_columns()) + { + return Err(general_err!("Column order length mismatch")); + } + + let column_orders = file_meta.column_orders.map(|cos| { + let mut res = Vec::with_capacity(cos.len()); + for (i, column) in schema_descr.columns().iter().enumerate() { + match cos[i] { + ColumnOrder::TYPE_DEFINED_ORDER(_) => { + let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), + column.converted_type(), + column.physical_type(), + ); + res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); + } + _ => res.push(cos[i]), + } + } + res + }); + + let fmd = crate::file::metadata::FileMetaData::new( + version, + num_rows, + created_by, + key_value_metadata, + schema_descr, + column_orders, + ); + let mut metadata = ParquetMetaData::new(fmd, row_groups); + + metadata.with_file_decryptor(file_decryptor); + + 
Ok(metadata) +} + /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 5308825b0976..335f0bc3601b 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1877,7 +1877,7 @@ mod tests { let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); assert_eq!( ret.err().unwrap().to_string(), - "Parquet error: Could not parse metadata: bad data" + "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" ); } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 41a5bf3b43f9..bbce3918b74c 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -181,17 +181,17 @@ macro_rules! thrift_union_all_empty { #[macro_export] #[allow(clippy::crate_in_macro_def)] macro_rules! thrift_union { - ($(#[$($def_attrs:tt)*])* union $identifier:ident { $($(#[$($field_attrs:tt)*])* $field_id:literal : $( ( $field_type:ident $(< $element_type:ident >)? ) )? $field_name:ident $(;)?)* }) => { + ($(#[$($def_attrs:tt)*])* union $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $( ( $field_type:ident $(< $element_type:ident >)? $(< $field_lt:lifetime >)?) )? $field_name:ident $(;)?)* }) => { $(#[cfg_attr(not(doctest), $($def_attrs)*)])* #[derive(Clone, Debug, Eq, PartialEq)] #[allow(non_camel_case_types)] #[allow(non_snake_case)] #[allow(missing_docs)] - pub enum $identifier { - $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($element_type)?} ) )?),* + pub enum $identifier $(<$lt>)? { + $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($field_lt)? $($element_type)?} ) )?),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { + impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { type Error = ParquetError; fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { @@ -220,7 +220,7 @@ macro_rules! thrift_union { } } - impl WriteThrift for $identifier { + impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { const ELEMENT_TYPE: ElementType = ElementType::Struct; fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { @@ -232,7 +232,7 @@ macro_rules! thrift_union { } } - impl WriteThriftField for $identifier { + impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? { fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; @@ -466,6 +466,9 @@ macro_rules! __thrift_field_type { #[doc(hidden)] #[macro_export] macro_rules! 
__thrift_union_type { + (binary $lt:lifetime) => { &$lt [u8] }; + (string $lt:lifetime) => { &$lt str }; + ($field_type:ident $lt:lifetime) => { $field_type<$lt> }; ($field_type:ident) => { $field_type }; (list $field_type:ident) => { Vec<$field_type> }; } diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 619bbb862fe1..58e342ab39d1 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -82,7 +82,7 @@ fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); assert_eq!( err.to_string(), - "Parquet error: Unexpected parquet Type: -7" + "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); } From 1eaa17b1bd629748eeebd17319c295f2f71f6bb4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 12:53:42 -0700 Subject: [PATCH 033/126] refactor and clippy fixes --- parquet/benches/metadata.rs | 12 --- parquet/src/file/metadata/reader.rs | 120 +----------------------- parquet/src/file/metadata/thrift_gen.rs | 88 +++++++++++++++-- parquet/src/parquet_thrift.rs | 4 +- 4 files changed, 86 insertions(+), 138 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 3c293462a157..151d928957ff 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -211,12 +211,6 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("decode parquet metadata new", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_file_metadata(&meta_data).unwrap(); - }) - }); - let buf: Bytes = black_box(encoded_meta()).into(); c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { @@ -230,12 +224,6 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("decode parquet metadata new (wide)", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_file_metadata(&buf).unwrap(); - }) - }); - // rewrite file with page statistics. then read page headers. 
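// The two benches deleted above only exercised the now-removed
// `decode_file_metadata` entry point, which had become redundant with
// `decode_metadata`. A minimal sketch of the surviving benchmark shape, assuming
// `meta_data` holds thrift-encoded footer bytes as set up earlier in this file
// and relying on the imports already at the top of this bench; the helper name
// is illustrative.
fn bench_decode_metadata(c: &mut Criterion, meta_data: &Bytes) {
    c.bench_function("decode parquet metadata", |b| {
        b.iter(|| ParquetMetaDataReader::decode_metadata(meta_data).unwrap())
    });
}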
#[cfg(feature = "arrow")] let (file_bytes, metadata) = rewrite_file(data.clone()); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ddccf39703bc..7ab2db2f7ff3 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -17,15 +17,9 @@ use std::{io::Read, ops::Range}; -use crate::parquet_thrift::ThriftCompactInputProtocol; #[cfg(feature = "encryption")] -use crate::{ - encryption::{ - decrypt::{CryptoContext, FileDecryptionProperties, FileDecryptor}, - modules::create_footer_aad, - }, - file::metadata::thrift_gen::EncryptionAlgorithm, -}; +use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; +use crate::parquet_thrift::ThriftCompactInputProtocol; use bytes::Bytes; use crate::errors::{ParquetError, Result}; @@ -953,56 +947,7 @@ impl ParquetMetaDataReader { encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - use crate::file::metadata::thrift_gen::parquet_metadata_with_encryption; - - let mut prot = ThriftCompactInputProtocol::new(buf); - let mut file_decryptor = None; - let decrypted_fmd_buf; - - if encrypted_footer { - if let Some(file_decryption_properties) = file_decryption_properties { - use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileCryptoMetaData}; - - let t_file_crypto_metadata: FileCryptoMetaData = - FileCryptoMetaData::try_from(&mut prot) - .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; - let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { - EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, - _ => Some(false), - } - .unwrap_or(false); - if supply_aad_prefix && file_decryption_properties.aad_prefix().is_none() { - return Err(general_err!( - "Parquet file was encrypted with an AAD prefix that is not stored in the file, \ - but no AAD prefix was provided in the file decryption properties" - )); - } - let decryptor = get_file_decryptor( - t_file_crypto_metadata.encryption_algorithm, - t_file_crypto_metadata.key_metadata.as_deref(), - file_decryption_properties, - )?; - let footer_decryptor = decryptor.get_footer_decryptor(); - let aad_footer = create_footer_aad(decryptor.file_aad())?; - - decrypted_fmd_buf = footer_decryptor? 
- .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) - .map_err(|_| { - general_err!( - "Provided footer key and AAD were unable to decrypt parquet footer" - ) - })?; - prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); - - file_decryptor = Some(decryptor); - } else { - return Err(general_err!("Parquet file has an encrypted footer but decryption properties were not provided")); - } - } - - parquet_metadata_with_encryption( - &mut prot, - file_decryptor, + super::thrift_gen::parquet_metadata_with_encryption( file_decryption_properties, encrypted_footer, buf, @@ -1020,50 +965,12 @@ impl ParquetMetaDataReader { let mut prot = ThriftCompactInputProtocol::new(buf); ParquetMetaData::try_from(&mut prot) } - - /// create meta data from thrift encoded bytes - pub fn decode_file_metadata(buf: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); - ParquetMetaData::try_from(&mut prot) - } -} - -#[cfg(feature = "encryption")] -pub(super) fn get_file_decryptor( - encryption_algorithm: EncryptionAlgorithm, - footer_key_metadata: Option<&[u8]>, - file_decryption_properties: &FileDecryptionProperties, -) -> Result { - match encryption_algorithm { - EncryptionAlgorithm::AES_GCM_V1(algo) => { - let aad_file_unique = algo - .aad_file_unique - .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; - let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { - aad_prefix.clone() - } else { - algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() - }; - let aad_file_unique = aad_file_unique.to_vec(); - - FileDecryptor::new( - file_decryption_properties, - footer_key_metadata, - aad_file_unique, - aad_prefix, - ) - } - EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( - "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" - )), - } } #[cfg(test)] mod tests { use super::*; use bytes::Bytes; - use zstd::zstd_safe::WriteBuf; use crate::file::reader::Length; use crate::util::test_common::file_util::get_test_file; @@ -1210,27 +1117,6 @@ mod tests { "EOF: Parquet file too small. 
Size is 1728 but need 1729" ); } - - #[test] - fn test_new_decoder() { - let file = get_test_file("alltypes_tiny_pages.parquet"); - let len = file.len(); - - // read entire file - let bytes = file.get_bytes(0, len as usize).unwrap(); - let mut footer = [0u8; FOOTER_SIZE]; - footer.copy_from_slice(bytes.slice(len as usize - FOOTER_SIZE..).as_slice()); - let tail = ParquetMetaDataReader::decode_footer_tail(&footer).unwrap(); - let meta_len = tail.metadata_length(); - let metadata_bytes = bytes.slice(len as usize - FOOTER_SIZE - meta_len..); - - // get ParquetMetaData - let m = ParquetMetaDataReader::decode_file_metadata(&metadata_bytes).unwrap(); - let m2 = ParquetMetaDataReader::decode_metadata(&metadata_bytes).unwrap(); - - // check that metadatas are equivalent - assert_eq!(m, m2); - } } #[cfg(all(feature = "async", feature = "arrow", test))] diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 60e9b5036916..869bdbd20ac8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -595,14 +595,57 @@ fn row_group_from_encrypted_thrift( } #[cfg(feature = "encryption")] -pub(crate) fn parquet_metadata_with_encryption<'a>( - prot: &mut ThriftCompactInputProtocol<'a>, - mut file_decryptor: Option, +pub(crate) fn parquet_metadata_with_encryption( file_decryption_properties: Option<&FileDecryptionProperties>, encrypted_footer: bool, buf: &[u8], ) -> Result { - let file_meta = super::thrift_gen::FileMetaData::try_from(prot) + let mut prot = ThriftCompactInputProtocol::new(buf); + let mut file_decryptor = None; + let decrypted_fmd_buf; + + if encrypted_footer { + if let Some(file_decryption_properties) = file_decryption_properties { + let t_file_crypto_metadata: FileCryptoMetaData = + FileCryptoMetaData::try_from(&mut prot) + .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; + let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { + EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, + _ => Some(false), + } + .unwrap_or(false); + if supply_aad_prefix && file_decryption_properties.aad_prefix().is_none() { + return Err(general_err!( + "Parquet file was encrypted with an AAD prefix that is not stored in the file, \ + but no AAD prefix was provided in the file decryption properties" + )); + } + let decryptor = get_file_decryptor( + t_file_crypto_metadata.encryption_algorithm, + t_file_crypto_metadata.key_metadata, + file_decryption_properties, + )?; + let footer_decryptor = decryptor.get_footer_decryptor(); + let aad_footer = crate::encryption::modules::create_footer_aad(decryptor.file_aad())?; + + decrypted_fmd_buf = footer_decryptor? 
+ .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) + .map_err(|_| { + general_err!( + "Provided footer key and AAD were unable to decrypt parquet footer" + ) + })?; + prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); + + file_decryptor = Some(decryptor); + } else { + return Err(general_err!( + "Parquet file has an encrypted footer but decryption properties were not provided" + )); + } + } + + let file_meta = super::thrift_gen::FileMetaData::try_from(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let version = file_meta.version; @@ -617,9 +660,9 @@ pub(crate) fn parquet_metadata_with_encryption<'a>( (file_meta.encryption_algorithm, file_decryption_properties) { // File has a plaintext footer but encryption algorithm is set - let file_decryptor_value = crate::file::metadata::reader::get_file_decryptor( + let file_decryptor_value = get_file_decryptor( algo, - file_meta.footer_signing_key_metadata.as_deref(), + file_meta.footer_signing_key_metadata, file_decryption_properties, )?; if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { @@ -677,6 +720,37 @@ pub(crate) fn parquet_metadata_with_encryption<'a>( Ok(metadata) } +#[cfg(feature = "encryption")] +pub(super) fn get_file_decryptor( + encryption_algorithm: EncryptionAlgorithm, + footer_key_metadata: Option<&[u8]>, + file_decryption_properties: &FileDecryptionProperties, +) -> Result { + match encryption_algorithm { + EncryptionAlgorithm::AES_GCM_V1(algo) => { + let aad_file_unique = algo + .aad_file_unique + .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; + let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { + aad_prefix.clone() + } else { + algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() + }; + let aad_file_unique = aad_file_unique.to_vec(); + + FileDecryptor::new( + file_decryption_properties, + footer_key_metadata, + aad_file_unique, + aad_prefix, + ) + } + EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( + "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" + )), + } +} + /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. 
impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { @@ -772,7 +846,7 @@ mod tests { ymax: 128.5.into(), zmin: Some(11.0.into()), zmax: Some(1300.0.into()), - mmin: Some(3.14.into()), + mmin: Some(3.7.into()), mmax: Some(42.0.into()), }); } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 4f04d990860e..ac5d72ecdd69 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -665,8 +665,8 @@ where fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; - for i in 0..self.len() { - self[i].write_thrift(writer)?; + for item in self { + item.write_thrift(writer)?; } Ok(()) } From 713e38abb2ef6b2e406e96fd38d3845b0d2f9084 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 14:49:55 -0700 Subject: [PATCH 034/126] add page header defs --- parquet/src/file/metadata/thrift_gen.rs | 104 +++++++++++++++++++++++- parquet/src/parquet_macros.rs | 1 + 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 869bdbd20ac8..630126ea8be8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -21,7 +21,9 @@ use std::io::Write; use std::sync::Arc; use crate::{ - basic::{ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, Type}, + basic::{ + ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, PageType, Repetition, Type, + }, data_type::{ByteArray, FixedLenByteArray, Int96}, errors::{ParquetError, Result}, file::{ @@ -64,6 +66,102 @@ pub(crate) struct SchemaElement<'a> { } ); +thrift_struct!( +pub(crate) struct DataPageHeader { + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + 1: required i32 num_values + + /// Encoding used for this data page + 2: required Encoding encoding + + /// Encoding used for definition levels + 3: required Encoding definition_level_encoding; + + /// Encoding used for repetition levels + 4: required Encoding repetition_level_encoding; + + // Optional statistics for the data in this page + // page stats are pretty useless...lets ignore them + //5: optional Statistics statistics; +} +); + +thrift_struct!( + pub(crate) struct IndexPageHeader {} +); + +thrift_struct!( +pub(crate) struct DictionaryPageHeader { + /// Number of values in the dictionary + 1: required i32 num_values; + + /// Encoding using this dictionary page + 2: required Encoding encoding + + /// If true, the entries in the dictionary are sorted in ascending order + 3: optional bool is_sorted; +} +); + +thrift_struct!( +pub(crate) struct DataPageHeaderV2 { + /// Number of values, including NULLs, in this data page. + 1: required i32 num_values + /// Number of NULL values, in this data page. + /// Number of non-null = num_values - num_nulls which is also the number of values in the data section + 2: required i32 num_nulls + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. 
+    3: required i32 num_rows
+    /// Encoding used for data in this page
+    4: required Encoding encoding
+
+    // repetition levels and definition levels are always using RLE (without size in it)
+
+    /// Length of the definition levels
+    5: required i32 definition_levels_byte_length;
+    /// Length of the repetition levels
+    6: required i32 repetition_levels_byte_length;
+
+    /// Whether the values are compressed.
+    /// Which means the section of the page between
+    /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
+    /// is compressed with the compression_codec.
+    /// If missing it is considered compressed
+    7: optional bool is_compressed = true;
+
+    // Optional statistics for the data in this page
+    //8: optional Statistics statistics;
+}
+);
+
+thrift_struct!(
+pub(crate) struct PageHeader {
+    /// the type of the page: indicates which of the *_header fields is set
+    1: required PageType type_
+
+    /// Uncompressed page size in bytes (not including this header)
+    2: required i32 uncompressed_page_size
+
+    /// Compressed (and potentially encrypted) page size in bytes, not including this header
+    3: required i32 compressed_page_size
+
+    /// The 32-bit CRC checksum for the page, to be calculated as follows:
+    4: optional i32 crc
+
+    // Headers for page specific data. One only will be set.
+    5: optional DataPageHeader data_page_header;
+    6: optional IndexPageHeader index_page_header;
+    7: optional DictionaryPageHeader dictionary_page_header;
+    8: optional DataPageHeaderV2 data_page_header_v2;
+}
+);
+
 thrift_struct!(
 pub(crate) struct AesGcmV1<'a> {
     /// AAD prefix
@@ -226,7 +324,7 @@ struct SizeStatistics {
 );
 
 thrift_struct!(
-struct Statistics<'a> {
+pub(crate) struct Statistics<'a> {
     1: optional binary<'a> max;
     2: optional binary<'a> min;
     3: optional i64 null_count;
@@ -358,7 +456,7 @@ fn convert_column(
     Ok(result)
 }
 
-fn convert_stats(
+pub(crate) fn convert_stats(
     physical_type: Type,
     thrift_stats: Option,
) -> Result> {
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index bbce3918b74c..eba279f47c0e 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -312,6 +312,7 @@ macro_rules! 
thrift_struct { #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + #[allow(unused_mut, unused_variables)] let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* writer.write_struct_end() From 79e8f85cd9b7a584f74f6639d272767acaeaeaf1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 08:47:50 -0700 Subject: [PATCH 035/126] totally rework the input side add ReadThrift trait and make ThriftCompactInputProtocol a trait --- parquet/src/basic.rs | 39 +- parquet/src/file/column_crypto_metadata.rs | 4 +- parquet/src/file/metadata/mod.rs | 4 +- parquet/src/file/metadata/reader.rs | 6 +- parquet/src/file/metadata/thrift_gen.rs | 25 +- parquet/src/file/page_encoding_stats.rs | 6 +- parquet/src/file/page_index/index_reader.rs | 14 +- parquet/src/file/page_index/offset_index.rs | 12 +- parquet/src/parquet_macros.rs | 46 +-- parquet/src/parquet_thrift.rs | 409 ++++++++++---------- 10 files changed, 279 insertions(+), 286 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cf451b961f69..8cf6b5f85b8b 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -26,8 +26,8 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -165,9 +165,8 @@ pub enum ConvertedType { INTERVAL, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ConvertedType { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; Ok(match val { 0 => Self::UTF8, @@ -361,9 +360,8 @@ pub enum LogicalType { }, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; @@ -388,7 +386,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { Self::Enum } 5 => { - let val = DecimalType::try_from(&mut *prot)?; + let val = DecimalType::read_thrift(&mut *prot)?; Self::Decimal { scale: val.scale, precision: val.precision, @@ -399,21 +397,21 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { Self::Date } 7 => { - let val = TimeType::try_from(&mut *prot)?; + let val = TimeType::read_thrift(&mut *prot)?; Self::Time { is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c, unit: val.unit, } } 8 => { - let val = TimestampType::try_from(&mut *prot)?; + let val = TimestampType::read_thrift(&mut *prot)?; Self::Timestamp { is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c, unit: val.unit, } } 10 => { - let val = IntType::try_from(&mut *prot)?; + let val = IntType::read_thrift(&mut *prot)?; Self::Integer { is_signed: val.is_signed, bit_width: val.bit_width, @@ -440,19 +438,19 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for 
LogicalType { Self::Float16 } 16 => { - let val = VariantType::try_from(&mut *prot)?; + let val = VariantType::read_thrift(&mut *prot)?; Self::Variant { specification_version: val.specification_version, } } 17 => { - let val = GeometryType::try_from(&mut *prot)?; + let val = GeometryType::read_thrift(&mut *prot)?; Self::Geometry { crs: val.crs.map(|s| s.to_owned()), } } 18 => { - let val = GeographyType::try_from(&mut *prot)?; + let val = GeographyType::read_thrift(&mut *prot)?; Self::Geography { crs: val.crs.map(|s| s.to_owned()), algorithm: val.algorithm, @@ -756,9 +754,8 @@ pub enum Compression { LZ4_RAW, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for Compression { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; Ok(match val { 0 => Self::UNCOMPRESSED, @@ -1123,10 +1120,8 @@ impl ColumnOrder { } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 5bba07357947..6a538bd42bc0 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -26,8 +26,8 @@ use crate::format::{ EncryptionWithFooterKey as TEncryptionWithFooterKey, }; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{thrift_struct, thrift_union}; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 0c4372e38683..6f3a842d0985 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -121,8 +121,8 @@ use crate::{ use crate::{ basic::{ColumnOrder, Compression, Encoding, Type}, parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, }; use crate::{ diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 7ab2db2f7ff3..f5661f6d0cf3 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -19,7 +19,7 @@ use std::{io::Read, ops::Range}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; -use crate::parquet_thrift::ThriftCompactInputProtocol; +use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; use bytes::Bytes; use crate::errors::{ParquetError, Result}; @@ -962,8 +962,8 @@ impl ParquetMetaDataReader { /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata(buf: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); - ParquetMetaData::try_from(&mut prot) + let mut prot = 
ThriftSliceInputProtocol::new(buf); + ParquetMetaData::read_thrift(&mut prot) } } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 630126ea8be8..b656bacc8c7d 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -35,8 +35,8 @@ use crate::{ statistics::ValueStatistics, }, parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, thrift_struct, thrift_union, @@ -46,6 +46,7 @@ use crate::{ use crate::{ encryption::decrypt::{FileDecryptionProperties, FileDecryptor}, file::column_crypto_metadata::ColumnCryptoMetaData, + parquet_thrift::ThriftSliceInputProtocol, schema::types::SchemaDescPtr, }; @@ -141,6 +142,7 @@ pub(crate) struct DataPageHeaderV2 { ); thrift_struct!( +#[allow(dead_code)] pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -668,8 +670,8 @@ fn row_group_from_encrypted_thrift( ) })?; - let mut prot = ThriftCompactInputProtocol::new(decrypted_cc_buf.as_slice()); - let col_meta = ColumnMetaData::try_from(&mut prot)?; + let mut prot = ThriftSliceInputProtocol::new(decrypted_cc_buf.as_slice()); + let col_meta = ColumnMetaData::read_thrift(&mut prot)?; c.meta_data = Some(col_meta); columns.push(convert_column(c, d.clone())?); } else { @@ -698,14 +700,14 @@ pub(crate) fn parquet_metadata_with_encryption( encrypted_footer: bool, buf: &[u8], ) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); + let mut prot = ThriftSliceInputProtocol::new(buf); let mut file_decryptor = None; let decrypted_fmd_buf; if encrypted_footer { if let Some(file_decryption_properties) = file_decryption_properties { let t_file_crypto_metadata: FileCryptoMetaData = - FileCryptoMetaData::try_from(&mut prot) + FileCryptoMetaData::read_thrift(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, @@ -733,7 +735,7 @@ pub(crate) fn parquet_metadata_with_encryption( "Provided footer key and AAD were unable to decrypt parquet footer" ) })?; - prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); + prot = ThriftSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); file_decryptor = Some(decryptor); } else { @@ -743,7 +745,7 @@ pub(crate) fn parquet_metadata_with_encryption( } } - let file_meta = super::thrift_gen::FileMetaData::try_from(&mut prot) + let file_meta = super::thrift_gen::FileMetaData::read_thrift(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let version = file_meta.version; @@ -851,10 +853,9 @@ pub(super) fn get_file_decryptor( /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. 
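// With this patch the TryFrom impl below becomes a ReadThrift impl. Driving it is
// a two-step composition, sketched here under assumptions: `decode_footer` is a
// hypothetical wrapper (the public entry point remains
// ParquetMetaDataReader::decode_metadata), and `footer_bytes` holds only the
// thrift-encoded footer payload, without the 8-byte footer tail.

fn decode_footer(footer_bytes: &[u8]) -> Result<ParquetMetaData> {
    // The protocol is just a cursor over the slice; read_thrift consumes the
    // serialized FileMetaData and converts it into ParquetMetaData in one pass.
    let mut prot = ThriftSliceInputProtocol::new(footer_bytes);
    ParquetMetaData::read_thrift(&mut prot)
}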
-impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { - let file_meta = super::thrift_gen::FileMetaData::try_from(prot)?; +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaData { + fn read_thrift(prot: &mut R) -> Result { + let file_meta = super::thrift_gen::FileMetaData::read_thrift(prot)?; let version = file_meta.version; let num_rows = file_meta.num_rows; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 2d433dc9b3f1..934e177de0da 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -20,10 +20,10 @@ use std::io::Write; use crate::basic::{Encoding, PageType}; -use crate::errors::{ParquetError, Result}; +use crate::errors::Result; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, }; use crate::thrift_struct; diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index e9cf119224c9..3db597954e6c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -27,8 +27,8 @@ use crate::file::page_index::column_index::{ use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift, WriteThriftField, }; use crate::thrift_struct; use std::io::Write; @@ -136,15 +136,15 @@ pub fn read_offset_indexes( } pub(crate) fn decode_offset_index(data: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(data); + let mut prot = ThriftSliceInputProtocol::new(data); // Try to read fast-path first. If that fails, fall back to slower but more robust // decoder. 
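// The reset works because ThriftSliceInputProtocol is only a cursor over `data`;
// constructing a fresh protocol rewinds to offset zero. The same shape in
// isolation, a sketch with `fast` and `slow` standing in for the two decoders:
fn decode_with_fallback<T>(
    data: &[u8],
    fast: impl Fn(&[u8]) -> Result<T>,
    slow: impl Fn(&[u8]) -> Result<T>,
) -> Result<T> {
    // The fast path may have consumed part of the buffer before failing, so the
    // fallback must restart from the beginning rather than resume mid-stream.
    fast(data).or_else(|_| slow(data))
}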
match OffsetIndexMetaData::try_from_fast(&mut prot) { Ok(offset_index) => Ok(offset_index), Err(_) => { - prot = ThriftCompactInputProtocol::new(data); - OffsetIndexMetaData::try_from(&mut prot) + prot = ThriftSliceInputProtocol::new(data); + OffsetIndexMetaData::read_thrift(&mut prot) } } } @@ -166,8 +166,8 @@ pub(crate) fn decode_column_index( data: &[u8], column_type: Type, ) -> Result { - let mut prot = ThriftCompactInputProtocol::new(data); - let index = ThriftColumnIndex::try_from(&mut prot)?; + let mut prot = ThriftSliceInputProtocol::new(data); + let index = ThriftColumnIndex::read_thrift(&mut prot)?; let index = match column_type { Type::BOOLEAN => { diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index ac2620af09d8..2153b8ed3009 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -22,8 +22,8 @@ use std::io::Write; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{ errors::{ParquetError, Result}, @@ -113,7 +113,9 @@ impl OffsetIndexMetaData { // Fast-path read of offset index. This works because we expect all field deltas to be 1, // and there's no nesting beyond PageLocation, so no need to save the last field id. Like // read_page_locations(), this will fail if absolute field id's are used. - pub(super) fn try_from_fast<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + pub(super) fn try_from_fast<'a, R: ThriftCompactInputProtocol<'a>>( + prot: &mut R, + ) -> Result { // Offset index is a struct with 2 fields. First field is an array of PageLocations, // the second an optional array of i64. @@ -140,7 +142,7 @@ impl OffsetIndexMetaData { "encountered unknown field while reading OffsetIndex" )); } - let vec = Vec::::try_from(&mut *prot)?; + let vec = read_thrift_vec::(&mut *prot)?; unencoded_byte_array_data_bytes = Some(vec); // this one should be Stop @@ -164,7 +166,7 @@ impl OffsetIndexMetaData { // Note: this will fail if the fields are either out of order, or if a suboptimal // encoder doesn't use field deltas. -fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +fn read_page_location<'a, R: ThriftCompactInputProtocol<'a>>(prot: &mut R) -> Result { // there are 3 fields, all mandatory, so all field deltas should be 1 let (field_type, delta) = prot.read_field_header()?; if delta != 1 || field_type != FieldType::I64 as u8 { diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index eba279f47c0e..60e2f452f4f2 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -33,10 +33,9 @@ macro_rules! thrift_enum { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name = $field_value,)* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { - type Error = ParquetError; + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { #[allow(deprecated)] - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; match val { $($field_value => Ok(Self::$field_name),)* @@ -105,10 +104,8 @@ macro_rules! 
thrift_union_all_empty { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { @@ -191,10 +188,8 @@ macro_rules! thrift_union { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($field_lt)? $($element_type)?} ) )?),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { @@ -279,9 +274,8 @@ macro_rules! thrift_struct { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $vis $field_name: $crate::__thrift_required_or_optional!($required_or_optional $crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?))),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { + fn read_thrift(prot: &mut R) -> Result { $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?)> = None;)* prot.read_struct_begin()?; loop { @@ -414,38 +408,38 @@ macro_rules! __thrift_result_required_or_optional { #[macro_export] macro_rules! __thrift_read_field { ($prot:tt, list $lt:lifetime binary) => { - Vec::<&'a [u8]>::try_from(&mut *$prot)? + read_thrift_vec::<&'a [u8], R>(&mut *$prot)? }; ($prot:tt, list $lt:lifetime $element_type:ident) => { - Vec::<$element_type>::try_from(&mut *$prot)? + read_thrift_vec::<$element_type, R>(&mut *$prot)? }; ($prot:tt, list string) => { - Vec::::try_from(&mut *$prot)? + read_thrift_vec::(&mut *$prot)? }; ($prot:tt, list $element_type:ident) => { - Vec::<$element_type>::try_from(&mut *$prot)? + read_thrift_vec::<$element_type, R>(&mut *$prot)? }; ($prot:tt, string $lt:lifetime) => { - <&$lt str>::try_from(&mut *$prot)? + <&$lt str>::read_thrift(&mut *$prot)? }; ($prot:tt, binary $lt:lifetime) => { - <&$lt [u8]>::try_from(&mut *$prot)? + <&$lt [u8]>::read_thrift(&mut *$prot)? }; ($prot:tt, $field_type:ident $lt:lifetime) => { - $field_type::try_from(&mut *$prot)? + $field_type::read_thrift(&mut *$prot)? }; ($prot:tt, string) => { - String::try_from(&mut *$prot)? + String::read_thrift(&mut *$prot)? }; ($prot:tt, binary) => { // this one needs to not conflict with `list` $prot.read_bytes()?.to_vec() }; ($prot:tt, double) => { - $crate::parquet_thrift::OrderedF64::try_from(&mut *$prot)? + $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)? }; ($prot:tt, $field_type:ident) => { - $field_type::try_from(&mut *$prot)? + $field_type::read_thrift(&mut *$prot)? }; } @@ -478,10 +472,10 @@ macro_rules! __thrift_union_type { #[macro_export] macro_rules! 
__thrift_read_variant { ($prot:tt, $field_name:ident $field_type:ident) => { - Self::$field_name($field_type::try_from(&mut *$prot)?) + Self::$field_name($field_type::read_thrift(&mut *$prot)?) }; ($prot:tt, $field_name:ident list $field_type:ident) => { - Self::$field_name(Vec::<$field_type>::try_from(&mut *$prot)?) + Self::$field_name(Vec::<$field_type>::read_thrift(&mut *$prot)?) }; ($prot:tt, $field_name:ident) => {{ $prot.skip_empty_struct()?; diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index ac5d72ecdd69..29e209e2f21f 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -167,41 +167,12 @@ pub(crate) struct ListIdentifier { pub(crate) size: i32, } -/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice -/// -/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol -pub(crate) struct ThriftCompactInputProtocol<'a> { - buf: &'a [u8], - // Identifier of the last field deserialized for a struct. - last_read_field_id: i16, - // Stack of the last read field ids (a new entry is added each time a nested struct is read). - read_field_id_stack: Vec, - // Boolean value for a field. - // Saved because boolean fields and their value are encoded in a single byte, - // and reading the field only occurs after the field id is read. - pending_read_bool_value: Option, -} +pub(crate) trait ThriftCompactInputProtocol<'a> { + fn read_byte(&mut self) -> Result; -impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { - pub fn new(buf: &'a [u8]) -> Self { - Self { - buf, - last_read_field_id: 0, - read_field_id_stack: Vec::with_capacity(16), - pending_read_bool_value: None, - } - } + fn read_bytes(&mut self) -> Result<&'a [u8]>; - pub fn reset_buffer(&mut self, buf: &'a [u8]) { - self.buf = buf; - self.last_read_field_id = 0; - self.read_field_id_stack.clear(); - self.pending_read_bool_value = None; - } - - pub fn as_slice(&self) -> &'a [u8] { - self.buf - } + fn skip_bytes(&mut self, n: usize) -> Result<()>; fn read_vlq(&mut self) -> Result { let mut in_progress = 0; @@ -221,7 +192,7 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { Ok((val >> 1) as i64 ^ -((val & 1) as i64)) } - fn read_list_set_begin(&mut self) -> Result<(ElementType, i32)> { + fn read_list_begin(&mut self) -> Result { let header = self.read_byte()?; let element_type = ElementType::try_from(header & 0x0f)?; @@ -233,22 +204,17 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { self.read_vlq()? as _ }; - Ok((element_type, element_count)) + Ok(ListIdentifier { + element_type, + size: element_count, + }) } - pub(crate) fn read_struct_begin(&mut self) -> Result<()> { - self.read_field_id_stack.push(self.last_read_field_id); - self.last_read_field_id = 0; - Ok(()) - } + fn read_struct_begin(&mut self) -> Result<()>; - pub(crate) fn read_struct_end(&mut self) -> Result<()> { - self.last_read_field_id = self - .read_field_id_stack - .pop() - .expect("should have previous field ids"); - Ok(()) - } + fn read_struct_end(&mut self) -> Result<()>; + + fn read_field_begin(&mut self) -> Result; // This is a specialized version of read_field_begin, solely for use in parsing // PageLocation structs in the offset index. This function assumes that the delta @@ -256,138 +222,37 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { // will be read. This also skips validation of the field type. 
// // Returns a tuple of (field_type, field_delta) - pub(crate) fn read_field_header(&mut self) -> Result<(u8, u8)> { + fn read_field_header(&mut self) -> Result<(u8, u8)> { let field_type = self.read_byte()?; let field_delta = (field_type & 0xf0) >> 4; let field_type = field_type & 0xf; Ok((field_type, field_delta)) } - pub(crate) fn read_field_begin(&mut self) -> Result { - // we can read at least one byte, which is: - // - the type - // - the field delta and the type - let field_type = self.read_byte()?; - let field_delta = (field_type & 0xf0) >> 4; - let field_type = FieldType::try_from(field_type & 0xf)?; + fn read_bool(&mut self) -> Result; - match field_type { - FieldType::Stop => Ok(FieldIdentifier { - field_type: FieldType::Stop, - id: 0, - }), - _ => { - // special handling for bools - if field_type == FieldType::BooleanFalse { - self.pending_read_bool_value = Some(false); - } else if field_type == FieldType::BooleanTrue { - self.pending_read_bool_value = Some(true); - } - if field_delta != 0 { - self.last_read_field_id = self - .last_read_field_id - .checked_add(field_delta as i16) - .map_or_else( - || { - Err(general_err!(format!( - "cannot add {} to {}", - field_delta, self.last_read_field_id - ))) - }, - Ok, - )?; - } else { - self.last_read_field_id = self.read_i16()?; - }; - - Ok(FieldIdentifier { - field_type, - id: self.last_read_field_id, - }) - } - } - } - - pub(crate) fn read_bool(&mut self) -> Result { - match self.pending_read_bool_value.take() { - Some(b) => Ok(b), - None => { - let b = self.read_byte()?; - // Previous versions of the thrift specification said to use 0 and 1 inside collections, - // but that differed from existing implementations. - // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. - // At least the go implementation seems to have followed the previously documented values. - match b { - 0x01 => Ok(true), - 0x00 | 0x02 => Ok(false), - unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), - } - } - } - } - - pub(crate) fn read_bytes(&mut self) -> Result<&'b [u8]> { - let len = self.read_vlq()? as usize; - let ret = self.buf.get(..len).ok_or_else(eof_error)?; - self.buf = &self.buf[len..]; - Ok(ret) - } - - pub(crate) fn read_string(&mut self) -> Result<&'b str> { + fn read_string(&mut self) -> Result<&'a str> { let slice = self.read_bytes()?; Ok(std::str::from_utf8(slice)?) } - pub(crate) fn read_i8(&mut self) -> Result { + fn read_i8(&mut self) -> Result { Ok(self.read_byte()? as _) } - pub(crate) fn read_i16(&mut self) -> Result { + fn read_i16(&mut self) -> Result { Ok(self.read_zig_zag()? as _) } - pub(crate) fn read_i32(&mut self) -> Result { + fn read_i32(&mut self) -> Result { Ok(self.read_zig_zag()? 
as _) } - pub(crate) fn read_i64(&mut self) -> Result { + fn read_i64(&mut self) -> Result { self.read_zig_zag() } - pub(crate) fn read_double(&mut self) -> Result { - let slice = self.buf.get(..8).ok_or_else(eof_error)?; - self.buf = &self.buf[8..]; - match slice.try_into() { - Ok(slice) => Ok(f64::from_le_bytes(slice)), - Err(_) => Err(general_err!("Unexpected error converting slice")), - } - } - - pub(crate) fn read_list_begin(&mut self) -> Result { - let (element_type, element_count) = self.read_list_set_begin()?; - Ok(ListIdentifier { - element_type, - size: element_count, - }) - } - - pub(crate) fn read_list_end(&mut self) -> Result<()> { - Ok(()) - } - - #[inline] - fn read_byte(&mut self) -> Result { - let ret = *self.buf.first().ok_or_else(eof_error)?; - self.buf = &self.buf[1..]; - Ok(ret) - } - - #[inline] - fn skip_bytes(&mut self, n: usize) -> Result<()> { - self.buf.get(..n).ok_or_else(eof_error)?; - self.buf = &self.buf[n..]; - Ok(()) - } + fn read_double(&mut self) -> Result; fn skip_vlq(&mut self) -> Result<()> { loop { @@ -405,14 +270,14 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { /// Skip a field with type `field_type` recursively until the default /// maximum skip depth is reached. - pub(crate) fn skip(&mut self, field_type: FieldType) -> Result<()> { + fn skip(&mut self, field_type: FieldType) -> Result<()> { // TODO: magic number self.skip_till_depth(field_type, 64) } /// Empty structs in unions consist of a single byte of 0 for the field stop record. /// This skips that byte without pushing to the field id stack. - pub(crate) fn skip_empty_struct(&mut self) -> Result<()> { + fn skip_empty_struct(&mut self) -> Result<()> { let b = self.read_byte()?; if b != 0 { Err(general_err!("Empty struct has fields")) @@ -452,7 +317,7 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { let element_type = FieldType::try_from(list_ident.element_type)?; self.skip_till_depth(element_type, depth - 1)?; } - self.read_list_end() + Ok(()) } // no list or map types in parquet format u => Err(general_err!(format!("cannot skip field type {:?}", &u))), @@ -460,90 +325,226 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { } } +pub(crate) struct ThriftSliceInputProtocol<'a> { + buf: &'a [u8], + // Identifier of the last field deserialized for a struct. + last_read_field_id: i16, + // Stack of the last read field ids (a new entry is added each time a nested struct is read). + read_field_id_stack: Vec, + // Boolean value for a field. + // Saved because boolean fields and their value are encoded in a single byte, + // and reading the field only occurs after the field id is read. + pending_read_bool_value: Option, +} + +impl<'a> ThriftSliceInputProtocol<'a> { + pub fn new(buf: &'a [u8]) -> Self { + Self { + buf, + last_read_field_id: 0, + read_field_id_stack: Vec::with_capacity(16), + pending_read_bool_value: None, + } + } + + pub fn reset_buffer(&mut self, buf: &'a [u8]) { + self.buf = buf; + self.last_read_field_id = 0; + self.read_field_id_stack.clear(); + self.pending_read_bool_value = None; + } + + pub fn as_slice(&self) -> &'a [u8] { + self.buf + } +} + +impl<'b, 'a: 'b> ThriftCompactInputProtocol<'b> for ThriftSliceInputProtocol<'a> { + #[inline] + fn read_byte(&mut self) -> Result { + let ret = *self.buf.first().ok_or_else(eof_error)?; + self.buf = &self.buf[1..]; + Ok(ret) + } + + fn read_bytes(&mut self) -> Result<&'b [u8]> { + let len = self.read_vlq()? 
as usize; + let ret = self.buf.get(..len).ok_or_else(eof_error)?; + self.buf = &self.buf[len..]; + Ok(ret) + } + + #[inline] + fn skip_bytes(&mut self, n: usize) -> Result<()> { + self.buf.get(..n).ok_or_else(eof_error)?; + self.buf = &self.buf[n..]; + Ok(()) + } + + fn read_double(&mut self) -> Result { + let slice = self.buf.get(..8).ok_or_else(eof_error)?; + self.buf = &self.buf[8..]; + match slice.try_into() { + Ok(slice) => Ok(f64::from_le_bytes(slice)), + Err(_) => Err(general_err!("Unexpected error converting slice")), + } + } + + fn read_struct_begin(&mut self) -> Result<()> { + self.read_field_id_stack.push(self.last_read_field_id); + self.last_read_field_id = 0; + Ok(()) + } + + fn read_struct_end(&mut self) -> Result<()> { + self.last_read_field_id = self + .read_field_id_stack + .pop() + .expect("should have previous field ids"); + Ok(()) + } + + fn read_field_begin(&mut self) -> Result { + // we can read at least one byte, which is: + // - the type + // - the field delta and the type + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = FieldType::try_from(field_type & 0xf)?; + + match field_type { + FieldType::Stop => Ok(FieldIdentifier { + field_type: FieldType::Stop, + id: 0, + }), + _ => { + // special handling for bools + if field_type == FieldType::BooleanFalse { + self.pending_read_bool_value = Some(false); + } else if field_type == FieldType::BooleanTrue { + self.pending_read_bool_value = Some(true); + } + if field_delta != 0 { + self.last_read_field_id = self + .last_read_field_id + .checked_add(field_delta as i16) + .map_or_else( + || { + Err(general_err!(format!( + "cannot add {} to {}", + field_delta, self.last_read_field_id + ))) + }, + Ok, + )?; + } else { + self.last_read_field_id = self.read_i16()?; + }; + + Ok(FieldIdentifier { + field_type, + id: self.last_read_field_id, + }) + } + } + } + + fn read_bool(&mut self) -> Result { + match self.pending_read_bool_value.take() { + Some(b) => Ok(b), + None => { + let b = self.read_byte()?; + // Previous versions of the thrift specification said to use 0 and 1 inside collections, + // but that differed from existing implementations. + // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. + // At least the go implementation seems to have followed the previously documented values. 
+ match b { + 0x01 => Ok(true), + 0x00 | 0x02 => Ok(false), + unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), + } + } + } + } +} + fn eof_error() -> ParquetError { eof_err!("Unexpected EOF") } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for bool { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +pub(crate) trait ReadThrift<'a, R: ThriftCompactInputProtocol<'a>> { + // used to read generated enums and structs + fn read_thrift(prot: &mut R) -> Result + where + Self: Sized; +} + +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for bool { + fn read_thrift(prot: &mut R) -> Result { prot.read_bool() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i8 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i8 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i8() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i16 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i16 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i16() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i32 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i32 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i32() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i64 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i64 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i64() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for OrderedF64 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for OrderedF64 { + fn read_thrift(prot: &mut R) -> Result { Ok(OrderedF64(prot.read_double()?)) } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for &'a str { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a str { + fn read_thrift(prot: &mut R) -> Result { prot.read_string() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for String { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for String { + fn read_thrift(prot: &mut R) -> Result { Ok(prot.read_string()?.to_owned()) } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for &'a [u8] { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a [u8] { + fn read_thrift(prot: &mut R) -> Result { prot.read_bytes() } } -impl<'a, T> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Vec +pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result> where - T: for<'b> TryFrom<&'b mut ThriftCompactInputProtocol<'a>>, - ParquetError: for<'b> From<>>::Error>, + R: ThriftCompactInputProtocol<'a>, + T: ReadThrift<'a, R>, { - type Error = ParquetError; - - fn try_from(prot: &mut 
ThriftCompactInputProtocol<'a>) -> Result { - let list_ident = prot.read_list_begin()?; - let mut res = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - let val = T::try_from(prot)?; - res.push(val); - } - - Ok(res) + let list_ident = prot.read_list_begin()?; + let mut res = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let val = T::read_thrift(prot)?; + res.push(val); } + Ok(res) } ///////////////////////// @@ -900,11 +901,11 @@ pub(crate) mod tests { pub(crate) fn test_roundtrip(val: T) where - T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift> + PartialEq + Debug, - for<'a> >>::Error: Debug, + //for<'a> >>::Error: Debug, { let buf = Vec::::new(); let mut writer = ThriftCompactOutputProtocol::new(buf); @@ -912,8 +913,8 @@ pub(crate) mod tests { //println!("serialized: {:x?}", writer.inner()); - let mut prot = ThriftCompactInputProtocol::new(writer.inner()); - let read_val = T::try_from(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(writer.inner()); + let read_val = T::read_thrift(&mut prot).unwrap(); assert_eq!(val, read_val); } From b31c9e69c0dcb75cf45a1ec4491bdf0808461293 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 11:54:37 -0700 Subject: [PATCH 036/126] rework struct field reading --- parquet/src/basic.rs | 13 +-- parquet/src/parquet_macros.rs | 45 +++++----- parquet/src/parquet_thrift.rs | 162 +++++++++++++--------------------- 3 files changed, 87 insertions(+), 133 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 8cf6b5f85b8b..4aeca93cfbde 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -362,9 +362,7 @@ pub enum LogicalType { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("received empty union from remote LogicalType")); } @@ -463,13 +461,12 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { } } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote LogicalType" )); } - prot.read_struct_end()?; Ok(ret) } } @@ -1122,8 +1119,7 @@ impl ColumnOrder { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote ColumnOrder")); } @@ -1138,13 +1134,12 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { Self::UNKNOWN } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote ColumnOrder" )); } - prot.read_struct_end()?; Ok(ret) } } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 60e2f452f4f2..3941d84c0dda 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ 
-106,8 +106,7 @@ macro_rules! thrift_union_all_empty { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote {}", stringify!($identifier))); } @@ -121,13 +120,12 @@ macro_rules! thrift_union_all_empty { return Err(general_err!("Unexpected {} {}", stringify!($identifier), field_ident.id)); } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote {}", stringify!($identifier) )); } - prot.read_struct_end()?; Ok(ret) } } @@ -190,8 +188,7 @@ macro_rules! thrift_union { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote {}", stringify!($identifier))); } @@ -204,13 +201,12 @@ macro_rules! thrift_union { return Err(general_err!("Unexpected {} {}", stringify!($identifier), field_ident.id)); } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( concat!("Received multiple fields for union from remote {}", stringify!($identifier)) )); } - prot.read_struct_end()?; Ok(ret) } } @@ -277,23 +273,23 @@ macro_rules! thrift_struct { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { fn read_thrift(prot: &mut R) -> Result { $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?)> = None;)* - prot.read_struct_begin()?; + let mut last_field_id = 0i16; loop { - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(last_field_id)?; if field_ident.field_type == FieldType::Stop { break; } match field_ident.id { $($field_id => { - let val = $crate::__thrift_read_field!(prot, $field_type $($field_lt)? $($element_type)?); + let val = $crate::__thrift_read_field!(prot, field_ident, $field_type $($field_lt)? $($element_type)?); $field_name = Some(val); })* _ => { prot.skip(field_ident.field_type)?; } }; + last_field_id = field_ident.id; } - prot.read_struct_end()?; $($crate::__thrift_result_required_or_optional!($required_or_optional $field_name);)* Ok(Self { $($field_name),* @@ -407,38 +403,41 @@ macro_rules! __thrift_result_required_or_optional { #[doc(hidden)] #[macro_export] macro_rules! __thrift_read_field { - ($prot:tt, list $lt:lifetime binary) => { + ($prot:tt, $field_ident:tt, list $lt:lifetime binary) => { read_thrift_vec::<&'a [u8], R>(&mut *$prot)? }; - ($prot:tt, list $lt:lifetime $element_type:ident) => { + ($prot:tt, $field_ident:tt, list $lt:lifetime $element_type:ident) => { read_thrift_vec::<$element_type, R>(&mut *$prot)? }; - ($prot:tt, list string) => { + ($prot:tt, $field_ident:tt, list string) => { read_thrift_vec::(&mut *$prot)? }; - ($prot:tt, list $element_type:ident) => { + ($prot:tt, $field_ident:tt, list $element_type:ident) => { read_thrift_vec::<$element_type, R>(&mut *$prot)? 
}; - ($prot:tt, string $lt:lifetime) => { + ($prot:tt, $field_ident:tt, string $lt:lifetime) => { <&$lt str>::read_thrift(&mut *$prot)? }; - ($prot:tt, binary $lt:lifetime) => { + ($prot:tt, $field_ident:tt, binary $lt:lifetime) => { <&$lt [u8]>::read_thrift(&mut *$prot)? }; - ($prot:tt, $field_type:ident $lt:lifetime) => { + ($prot:tt, $field_ident:tt, $field_type:ident $lt:lifetime) => { $field_type::read_thrift(&mut *$prot)? }; - ($prot:tt, string) => { + ($prot:tt, $field_ident:tt, string) => { String::read_thrift(&mut *$prot)? }; - ($prot:tt, binary) => { + ($prot:tt, $field_ident:tt, binary) => { // this one needs to not conflict with `list` $prot.read_bytes()?.to_vec() }; - ($prot:tt, double) => { + ($prot:tt, $field_ident:tt, double) => { $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)? }; - ($prot:tt, $field_type:ident) => { + ($prot:tt, $field_ident:tt, bool) => { + $field_ident.bool_val.unwrap() + }; + ($prot:tt, $field_ident:tt, $field_type:ident) => { $field_type::read_thrift(&mut *$prot)? }; } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 29e209e2f21f..b38f6780183f 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -159,6 +159,7 @@ impl TryFrom for ElementType { pub(crate) struct FieldIdentifier { pub(crate) field_type: FieldType, pub(crate) id: i16, + pub(crate) bool_val: Option, } #[derive(Clone, Debug, Eq, PartialEq)] @@ -210,11 +211,50 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { }) } - fn read_struct_begin(&mut self) -> Result<()>; + fn read_field_begin(&mut self, last_field_id: i16) -> Result { + // we can read at least one byte, which is: + // - the type + // - the field delta and the type + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = FieldType::try_from(field_type & 0xf)?; + let mut bool_val: Option = None; - fn read_struct_end(&mut self) -> Result<()>; + match field_type { + FieldType::Stop => Ok(FieldIdentifier { + field_type: FieldType::Stop, + id: 0, + bool_val, + }), + _ => { + // special handling for bools + if field_type == FieldType::BooleanFalse { + bool_val = Some(false); + } else if field_type == FieldType::BooleanTrue { + bool_val = Some(true); + } + let field_id = if field_delta != 0 { + last_field_id.checked_add(field_delta as i16).map_or_else( + || { + Err(general_err!(format!( + "cannot add {} to {}", + field_delta, last_field_id + ))) + }, + Ok, + )? + } else { + self.read_i16()? + }; - fn read_field_begin(&mut self) -> Result; + Ok(FieldIdentifier { + field_type, + id: field_id, + bool_val, + }) + } + } + } // This is a specialized version of read_field_begin, solely for use in parsing // PageLocation structs in the offset index. This function assumes that the delta @@ -229,7 +269,19 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { Ok((field_type, field_delta)) } - fn read_bool(&mut self) -> Result; + // not to be used for bool struct fields, just for bool arrays + fn read_bool(&mut self) -> Result { + let b = self.read_byte()?; + // Previous versions of the thrift specification said to use 0 and 1 inside collections, + // but that differed from existing implementations. + // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. + // At least the go implementation seems to have followed the previously documented values. 
+ match b { + 0x01 => Ok(true), + 0x00 | 0x02 => Ok(false), + unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), + } + } fn read_string(&mut self) -> Result<&'a str> { let slice = self.read_bytes()?; @@ -301,15 +353,16 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { FieldType::Double => self.skip_bytes(8).map(|_| ()), FieldType::Binary => self.skip_binary().map(|_| ()), FieldType::Struct => { - self.read_struct_begin()?; + let mut last_field_id = 0i16; loop { - let field_ident = self.read_field_begin()?; + let field_ident = self.read_field_begin(last_field_id)?; if field_ident.field_type == FieldType::Stop { break; } self.skip_till_depth(field_ident.field_type, depth - 1)?; + last_field_id = field_ident.id; } - self.read_struct_end() + Ok(()) } FieldType::List => { let list_ident = self.read_list_begin()?; @@ -327,31 +380,15 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { pub(crate) struct ThriftSliceInputProtocol<'a> { buf: &'a [u8], - // Identifier of the last field deserialized for a struct. - last_read_field_id: i16, - // Stack of the last read field ids (a new entry is added each time a nested struct is read). - read_field_id_stack: Vec, - // Boolean value for a field. - // Saved because boolean fields and their value are encoded in a single byte, - // and reading the field only occurs after the field id is read. - pending_read_bool_value: Option, } impl<'a> ThriftSliceInputProtocol<'a> { pub fn new(buf: &'a [u8]) -> Self { - Self { - buf, - last_read_field_id: 0, - read_field_id_stack: Vec::with_capacity(16), - pending_read_bool_value: None, - } + Self { buf } } pub fn reset_buffer(&mut self, buf: &'a [u8]) { self.buf = buf; - self.last_read_field_id = 0; - self.read_field_id_stack.clear(); - self.pending_read_bool_value = None; } pub fn as_slice(&self) -> &'a [u8] { @@ -389,83 +426,6 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'b> for ThriftSliceInputProtocol<'a> Err(_) => Err(general_err!("Unexpected error converting slice")), } } - - fn read_struct_begin(&mut self) -> Result<()> { - self.read_field_id_stack.push(self.last_read_field_id); - self.last_read_field_id = 0; - Ok(()) - } - - fn read_struct_end(&mut self) -> Result<()> { - self.last_read_field_id = self - .read_field_id_stack - .pop() - .expect("should have previous field ids"); - Ok(()) - } - - fn read_field_begin(&mut self) -> Result { - // we can read at least one byte, which is: - // - the type - // - the field delta and the type - let field_type = self.read_byte()?; - let field_delta = (field_type & 0xf0) >> 4; - let field_type = FieldType::try_from(field_type & 0xf)?; - - match field_type { - FieldType::Stop => Ok(FieldIdentifier { - field_type: FieldType::Stop, - id: 0, - }), - _ => { - // special handling for bools - if field_type == FieldType::BooleanFalse { - self.pending_read_bool_value = Some(false); - } else if field_type == FieldType::BooleanTrue { - self.pending_read_bool_value = Some(true); - } - if field_delta != 0 { - self.last_read_field_id = self - .last_read_field_id - .checked_add(field_delta as i16) - .map_or_else( - || { - Err(general_err!(format!( - "cannot add {} to {}", - field_delta, self.last_read_field_id - ))) - }, - Ok, - )?; - } else { - self.last_read_field_id = self.read_i16()?; - }; - - Ok(FieldIdentifier { - field_type, - id: self.last_read_field_id, - }) - } - } - } - - fn read_bool(&mut self) -> Result { - match self.pending_read_bool_value.take() { - Some(b) => Ok(b), - None => { - let b = self.read_byte()?; - // Previous versions of the thrift 
specification said to use 0 and 1 inside collections, - // but that differed from existing implementations. - // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. - // At least the go implementation seems to have followed the previously documented values. - match b { - 0x01 => Ok(true), - 0x00 | 0x02 => Ok(false), - unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), - } - } - } - } } fn eof_error() -> ParquetError { From 8c4e49df507034c3ccb79d6d519b344a8c5ced27 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 12:26:46 -0700 Subject: [PATCH 037/126] fix skipping bool fields --- parquet/src/parquet_thrift.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index b38f6780183f..29cf4a3d7c88 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -345,7 +345,8 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } match field_type { - FieldType::BooleanFalse | FieldType::BooleanTrue => self.read_bool().map(|_| ()), + // boolean field has no data + FieldType::BooleanFalse | FieldType::BooleanTrue => Ok(()), FieldType::Byte => self.read_i8().map(|_| ()), FieldType::I16 => self.skip_vlq().map(|_| ()), FieldType::I32 => self.skip_vlq().map(|_| ()), From e0e18529c7d6fce96f9a95f01a16dda2c6b18092 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 12:39:36 -0700 Subject: [PATCH 038/126] remove cruft --- parquet/src/parquet_thrift.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 29cf4a3d7c88..8b2ab4943a50 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -866,7 +866,6 @@ pub(crate) mod tests { + WriteThrift> + PartialEq + Debug, - //for<'a> >>::Error: Debug, { let buf = Vec::::new(); let mut writer = ThriftCompactOutputProtocol::new(buf); From d8081a9388629fe45089e62d192eab90e6441f58 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 17:35:45 -0700 Subject: [PATCH 039/126] fix clippy issues --- parquet/src/file/metadata/thrift_gen.rs | 2 +- parquet/src/parquet_thrift.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 1dc829e5cfe2..f15a5a6b16d8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -548,7 +548,7 @@ mod tests { ymax: 128.5.into(), zmin: Some(11.0.into()), zmax: Some(1300.0.into()), - mmin: Some(3.14.into()), + mmin: Some(3.7.into()), mmax: Some(42.0.into()), }); } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 4f04d990860e..ac5d72ecdd69 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -665,8 +665,8 @@ where fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; - for i in 0..self.len() { - self[i].write_thrift(writer)?; + for item in self { + item.write_thrift(writer)?; } Ok(()) } From 5d6c8b1303ece5f98984d78067796e372b5291bc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 17:45:01 -0700 Subject: [PATCH 040/126] allow unused page header structs --- parquet/src/file/metadata/thrift_gen.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 
630126ea8be8..06229fb1812f 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -141,6 +141,7 @@ pub(crate) struct DataPageHeaderV2 { ); thrift_struct!( +#[allow(dead_code)] pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ From 709e8130f9f6eda29290dfbd8907e0ab7cd143fc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 18:50:14 -0700 Subject: [PATCH 041/126] remove Write from WriteThrift --- parquet/src/basic.rs | 28 +++++------ parquet/src/parquet_macros.rs | 32 ++++++------ parquet/src/parquet_thrift.rs | 94 +++++++++++++++++------------------ 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cf451b961f69..5fd49043731e 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -197,17 +197,17 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } } -impl WriteThrift for ConvertedType { +impl WriteThrift for ConvertedType { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { // because we've added NONE, the variant values are off by 1, so correct that here writer.write_i32(*self as i32 - 1) } } -impl WriteThriftField for ConvertedType { - fn write_thrift_field( +impl WriteThriftField for ConvertedType { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -476,10 +476,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } } -impl WriteThrift for LogicalType { +impl WriteThrift for LogicalType { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { writer.write_empty_struct(1, 0)?; @@ -575,8 +575,8 @@ impl WriteThrift for LogicalType { } } -impl WriteThriftField for LogicalType { - fn write_thrift_field( +impl WriteThriftField for LogicalType { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -776,10 +776,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { // FIXME // ugh...why did we add compression level to some variants if we don't use them???? 
-impl WriteThrift for Compression { +impl WriteThrift for Compression { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let id: i32 = match *self { Self::UNCOMPRESSED => 0, Self::SNAPPY => 1, @@ -794,8 +794,8 @@ impl WriteThrift for Compression { } } -impl WriteThriftField for Compression { - fn write_thrift_field( +impl WriteThriftField for Compression { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -1154,10 +1154,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } } -impl WriteThrift for ColumnOrder { +impl WriteThrift for ColumnOrder { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { Self::TYPE_DEFINED_ORDER(_) => { writer.write_field_begin(FieldType::Struct, 1, 0)?; diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 41a5bf3b43f9..ae1d772a07cb 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -51,16 +51,16 @@ macro_rules! thrift_enum { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self as i32) } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -135,10 +135,10 @@ macro_rules! thrift_union_all_empty { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { $(Self::$field_name => writer.write_empty_struct($field_id, 0)?,)* }; @@ -147,8 +147,8 @@ macro_rules! thrift_union_all_empty { } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -220,10 +220,10 @@ macro_rules! 
thrift_union { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { $($crate::__thrift_write_variant_lhs!($field_name $($field_type)?, variant_val) => $crate::__thrift_write_variant_rhs!($field_id $($field_type)?, writer, variant_val),)* @@ -232,8 +232,8 @@ macro_rules! thrift_union { } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -307,19 +307,19 @@ macro_rules! thrift_struct { } } - impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { + impl $(<$lt>)? WriteThrift for $identifier $(<$lt>)? { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* writer.write_struct_end() } } - impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? 
{ + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index ac5d72ecdd69..593aec4e0f2b 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -650,20 +650,20 @@ impl ThriftCompactOutputProtocol { } } -pub(crate) trait WriteThrift { +pub(crate) trait WriteThrift { const ELEMENT_TYPE: ElementType; // used to write generated enums and structs - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } -impl WriteThrift for Vec +impl WriteThrift for Vec where - T: WriteThrift, + T: WriteThrift, { const ELEMENT_TYPE: ElementType = ElementType::List; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; for item in self { item.write_thrift(writer)?; @@ -672,82 +672,82 @@ where } } -impl WriteThrift for bool { +impl WriteThrift for bool { const ELEMENT_TYPE: ElementType = ElementType::Bool; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bool(*self) } } -impl WriteThrift for i8 { +impl WriteThrift for i8 { const ELEMENT_TYPE: ElementType = ElementType::Byte; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i8(*self) } } -impl WriteThrift for i16 { +impl WriteThrift for i16 { const ELEMENT_TYPE: ElementType = ElementType::I16; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i16(*self) } } -impl WriteThrift for i32 { +impl WriteThrift for i32 { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self) } } -impl WriteThrift for i64 { +impl WriteThrift for i64 { const ELEMENT_TYPE: ElementType = ElementType::I64; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i64(*self) } } -impl WriteThrift for OrderedF64 { +impl WriteThrift for OrderedF64 { const ELEMENT_TYPE: ElementType = ElementType::Double; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_double(self.0) } } -impl WriteThrift for &[u8] { +impl WriteThrift for &[u8] { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self) } } -impl WriteThrift for &str { +impl WriteThrift for &str { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut 
ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self.as_bytes()) } } -impl WriteThrift for String { +impl WriteThrift for String { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self.as_bytes()) } } -pub(crate) trait WriteThriftField { +pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. - fn write_thrift_field( + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -755,8 +755,8 @@ pub(crate) trait WriteThriftField { ) -> Result; } -impl WriteThriftField for bool { - fn write_thrift_field( +impl WriteThriftField for bool { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -771,8 +771,8 @@ impl WriteThriftField for bool { } } -impl WriteThriftField for i8 { - fn write_thrift_field( +impl WriteThriftField for i8 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -784,8 +784,8 @@ impl WriteThriftField for i8 { } } -impl WriteThriftField for i16 { - fn write_thrift_field( +impl WriteThriftField for i16 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -797,8 +797,8 @@ impl WriteThriftField for i16 { } } -impl WriteThriftField for i32 { - fn write_thrift_field( +impl WriteThriftField for i32 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -810,8 +810,8 @@ impl WriteThriftField for i32 { } } -impl WriteThriftField for i64 { - fn write_thrift_field( +impl WriteThriftField for i64 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -823,8 +823,8 @@ impl WriteThriftField for i64 { } } -impl WriteThriftField for OrderedF64 { - fn write_thrift_field( +impl WriteThriftField for OrderedF64 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -836,8 +836,8 @@ impl WriteThriftField for OrderedF64 { } } -impl WriteThriftField for &[u8] { - fn write_thrift_field( +impl WriteThriftField for &[u8] { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -849,8 +849,8 @@ impl WriteThriftField for &[u8] { } } -impl WriteThriftField for &str { - fn write_thrift_field( +impl WriteThriftField for &str { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -862,8 +862,8 @@ impl WriteThriftField for &str { } } -impl WriteThriftField for String { - fn write_thrift_field( +impl WriteThriftField for String { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -875,11 +875,11 @@ impl WriteThriftField for String { } } -impl WriteThriftField for Vec +impl WriteThriftField for Vec where - T: WriteThrift, + T: WriteThrift, { - fn write_thrift_field( + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -901,7 +901,7 @@ pub(crate) mod tests { pub(crate) fn test_roundtrip(val: T) where T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> - + WriteThrift> + + WriteThrift + PartialEq + Debug, for<'a> >>::Error: Debug, From 057945627d5ef81141382e1ffd3d3ab49d13cb6c Mon Sep 
17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 18:56:50 -0700 Subject: [PATCH 042/126] finish merge --- parquet/src/parquet_macros.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 222020797817..9405a9b174e4 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -232,8 +232,8 @@ macro_rules! thrift_union { } } - impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) From 04b74f5e73bd2da03b70ee01fce3c874d81f77b1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 20:01:59 -0700 Subject: [PATCH 043/126] stats --- parquet/src/errors.rs | 8 + parquet/src/file/metadata/thrift_gen.rs | 286 ++++++++++++++++-------- parquet/src/parquet_macros.rs | 62 ++++- parquet/src/parquet_thrift.rs | 59 ++++- 4 files changed, 313 insertions(+), 102 deletions(-) diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 93b2c1b7e028..341ba673c9db 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -19,6 +19,7 @@ use core::num::TryFromIntError; use std::error::Error; +use std::string::FromUtf8Error; use std::{cell, io, result, str}; #[cfg(feature = "arrow")] @@ -118,6 +119,13 @@ impl From for ParquetError { ParquetError::External(Box::new(e)) } } + +impl From for ParquetError { + fn from(e: FromUtf8Error) -> ParquetError { + ParquetError::External(Box::new(e)) + } +} + #[cfg(feature = "arrow")] impl From for ParquetError { fn from(e: ArrowError) -> ParquetError { diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index b656bacc8c7d..85cf27085f85 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -39,7 +39,7 @@ use crate::{ ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, - thrift_struct, thrift_union, + thrift_struct, thrift_struct_write_impl, thrift_union, util::bit_util::FromBytes, }; #[cfg(feature = "encryption")] @@ -67,103 +67,6 @@ pub(crate) struct SchemaElement<'a> { } ); -thrift_struct!( -pub(crate) struct DataPageHeader { - /// Number of values, including NULLs, in this data page. - /// - /// If a OffsetIndex is present, a page must begin at a row - /// boundary (repetition_level = 0). Otherwise, pages may begin - /// within a row (repetition_level > 0). 
- 1: required i32 num_values - - /// Encoding used for this data page - 2: required Encoding encoding - - /// Encoding used for definition levels - 3: required Encoding definition_level_encoding; - - /// Encoding used for repetition levels - 4: required Encoding repetition_level_encoding; - - // Optional statistics for the data in this page - // page stats are pretty useless...lets ignore them - //5: optional Statistics statistics; -} -); - -thrift_struct!( - pub(crate) struct IndexPageHeader {} -); - -thrift_struct!( -pub(crate) struct DictionaryPageHeader { - /// Number of values in the dictionary - 1: required i32 num_values; - - /// Encoding using this dictionary page - 2: required Encoding encoding - - /// If true, the entries in the dictionary are sorted in ascending order - 3: optional bool is_sorted; -} -); - -thrift_struct!( -pub(crate) struct DataPageHeaderV2 { - /// Number of values, including NULLs, in this data page. - 1: required i32 num_values - /// Number of NULL values, in this data page. - /// Number of non-null = num_values - num_nulls which is also the number of values in the data section - 2: required i32 num_nulls - /// Number of rows in this data page. Every page must begin at a - /// row boundary (repetition_level = 0): rows must **not** be - /// split across page boundaries when using V2 data pages. - 3: required i32 num_rows - /// Encoding used for data in this page - 4: required Encoding encoding - - // repetition levels and definition levels are always using RLE (without size in it) - - /// Length of the definition levels - 5: required i32 definition_levels_byte_length; - /// Length of the repetition levels - 6: required i32 repetition_levels_byte_length; - - /// Whether the values are compressed. - /// Which means the section of the page between - /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) - /// is compressed with the compression_codec. - /// If missing it is considered compressed - 7: optional bool is_compressed = true; - - // Optional statistics for the data in this page - //8: optional Statistics statistics; -} -); - -thrift_struct!( -#[allow(dead_code)] -pub(crate) struct PageHeader { - /// the type of the page: indicates which of the *_header fields is set - 1: required PageType type_ - - /// Uncompressed page size in bytes (not including this header) - 2: required i32 uncompressed_page_size - - /// Compressed (and potentially encrypted) page size in bytes, not including this header - 3: required i32 compressed_page_size - - /// The 32-bit CRC checksum for the page, to be be calculated as follows: - 4: optional i32 crc - - // Headers for page specific data. One only will be set. - 5: optional DataPageHeader data_page_header; - 6: optional IndexPageHeader index_page_header; - 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2 data_page_header_v2; -} -); - thrift_struct!( pub(crate) struct AesGcmV1<'a> { /// AAD prefix @@ -909,6 +812,191 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaDat } } +// page header stuff. this is partially hand coded so we can avoid parsing the page statistics. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct DataPageHeader<'a> { + pub(crate) num_values: i32, + pub(crate) encoding: Encoding, + pub(crate) definition_level_encoding: Encoding, + pub(crate) repetition_level_encoding: Encoding, + // this will only be used on write + pub(crate) statistics: Option>, +} + +thrift_struct_write_impl!( +struct DataPageHeader<'a> { + 1: required i32 num_values + 2: required Encoding encoding + 3: required Encoding definition_level_encoding; + 4: required Encoding repetition_level_encoding; + 5: optional Statistics<'a> statistics; +} +); + +// read data page header but skip statistics +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for DataPageHeader<'a> { + fn read_thrift(prot: &mut R) -> Result { + let mut num_values: Option = None; + let mut encoding: Option = None; + let mut definition_level_encoding: Option = None; + let mut repetition_level_encoding: Option = None; + + let mut last_field_id = 0i16; + loop { + let field_ident = prot.read_field_begin(last_field_id)?; + if field_ident.field_type == FieldType::Stop { + break; + } + match field_ident.id { + 1 => num_values = Some(prot.read_i32()?), + 2 => encoding = Some(Encoding::read_thrift(prot)?), + 3 => definition_level_encoding = Some(Encoding::read_thrift(prot)?), + 4 => repetition_level_encoding = Some(Encoding::read_thrift(prot)?), + _ => { + prot.skip(field_ident.field_type)?; + } + }; + last_field_id = field_ident.id; + } + + let num_values = num_values.expect("Required field num_values is missing"); + let encoding = encoding.expect("Required field encoding is missing"); + let definition_level_encoding = + definition_level_encoding.expect("Required field definition_level_encoding is missing"); + let repetition_level_encoding = + repetition_level_encoding.expect("Required field repetition_level_encoding is missing"); + + Ok(Self { + num_values, + encoding, + definition_level_encoding, + repetition_level_encoding, + statistics: None, + }) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct DataPageHeaderV2<'a> { + pub(crate) num_values: i32, + pub(crate) num_nulls: i32, + pub(crate) num_rows: i32, + pub(crate) encoding: Encoding, + pub(crate) definition_levels_byte_length: i32, + pub(crate) repetition_levels_byte_length: i32, + pub(crate) is_compressed: Option, + // this will only be used on write + pub(crate) statistics: Option>, +} + +thrift_struct_write_impl!( +struct DataPageHeaderV2<'a> { + 1: required i32 num_values + 2: required i32 num_nulls + 3: required i32 num_rows + 4: required Encoding encoding + 5: required i32 definition_levels_byte_length; + 6: required i32 repetition_levels_byte_length; + 7: optional bool is_compressed = true; + 8: optional Statistics<'a> statistics; +} +); + +// read data page header but skip statistics +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for DataPageHeaderV2<'a> { + fn read_thrift(prot: &mut R) -> Result { + let mut num_values: Option = None; + let mut num_nulls: Option = None; + let mut num_rows: Option = None; + let mut encoding: Option = None; + let mut definition_levels_byte_length: Option = None; + let mut repetition_levels_byte_length: Option = None; + let mut is_compressed: Option = Some(true); + + let mut last_field_id = 0i16; + loop { + let field_ident = prot.read_field_begin(last_field_id)?; + if field_ident.field_type == FieldType::Stop { + break; + } + match field_ident.id { + 1 => num_values = Some(prot.read_i32()?), + 2 => num_nulls = Some(prot.read_i32()?), + 3 => num_rows = 
Some(prot.read_i32()?),
+                4 => encoding = Some(Encoding::read_thrift(prot)?),
+                5 => definition_levels_byte_length = Some(prot.read_i32()?),
+                6 => repetition_levels_byte_length = Some(prot.read_i32()?),
+                7 => is_compressed = field_ident.bool_val,
+                _ => {
+                    prot.skip(field_ident.field_type)?;
+                }
+            };
+            last_field_id = field_ident.id;
+        }
+
+        let num_values = num_values.expect("Required field num_values is missing");
+        let num_nulls = num_nulls.expect("Required field num_nulls is missing");
+        let num_rows = num_rows.expect("Required field num_rows is missing");
+        let encoding = encoding.expect("Required field encoding is missing");
+        let definition_levels_byte_length = definition_levels_byte_length
+            .expect("Required field definition_levels_byte_length is missing");
+        let repetition_levels_byte_length = repetition_levels_byte_length
+            .expect("Required field repetition_levels_byte_length is missing");
+
+        Ok(Self {
+            num_values,
+            num_nulls,
+            num_rows,
+            encoding,
+            definition_levels_byte_length,
+            repetition_levels_byte_length,
+            is_compressed,
+            statistics: None,
+        })
+    }
+}
+
+thrift_struct!(
+    pub(crate) struct IndexPageHeader {}
+);
+
+thrift_struct!(
+pub(crate) struct DictionaryPageHeader {
+  /// Number of values in the dictionary
+  1: required i32 num_values;
+
+  /// Encoding using this dictionary page
+  2: required Encoding encoding
+
+  /// If true, the entries in the dictionary are sorted in ascending order
+  3: optional bool is_sorted;
+}
+);
+
+thrift_struct!(
+#[allow(dead_code)]
+pub(crate) struct PageHeader<'a> {
+  /// the type of the page: indicates which of the *_header fields is set
+  1: required PageType type_
+
+  /// Uncompressed page size in bytes (not including this header)
+  2: required i32 uncompressed_page_size
+
+  /// Compressed (and potentially encrypted) page size in bytes, not including this header
+  3: required i32 compressed_page_size
+
+  /// The 32-bit CRC checksum for the page, to be be calculated as follows:
+  4: optional i32 crc
+
+  // Headers for page specific data. One only will be set.
+  5: optional DataPageHeader<'a> data_page_header;
+  6: optional IndexPageHeader index_page_header;
+  7: optional DictionaryPageHeader dictionary_page_header;
+  8: optional DataPageHeaderV2<'a> data_page_header_v2;
+}
+);
+
+
 #[cfg(test)]
 mod tests {
     use crate::file::metadata::thrift_gen::BoundingBox;
@@ -949,4 +1037,4 @@ mod tests {
             mmax: Some(42.0.into()),
         });
     }
-}
+}
\ No newline at end of file
diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index f02d1470f72e..ab18e5199bea 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -319,6 +319,66 @@ macro_rules! thrift_struct {
     }
 }
 
+/// only implements ReadThrift for the given IDL struct definition
+#[macro_export]
+macro_rules! thrift_struct_read_impl {
+    ($(#[$($def_attrs:tt)*])* $vis:vis struct $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $required_or_optional:ident $field_type:ident $(< $field_lt:lifetime >)? $(< $element_type:ident >)? $field_name:ident $(= $default_value:literal)? $(;)?)* }) => {
+        $(#[cfg_attr(not(doctest), $($def_attrs)*)])*
+        impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? {
+            fn read_thrift(prot: &mut R) -> Result<Self> {
+                $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)?
$($element_type)?)> = None;)*
+                let mut last_field_id = 0i16;
+                loop {
+                    let field_ident = prot.read_field_begin(last_field_id)?;
+                    if field_ident.field_type == FieldType::Stop {
+                        break;
+                    }
+                    match field_ident.id {
+                        $($field_id => {
+                            let val = $crate::__thrift_read_field!(prot, field_ident, $field_type $($field_lt)? $($element_type)?);
+                            $field_name = Some(val);
+                        })*
+                        _ => {
+                            prot.skip(field_ident.field_type)?;
+                        }
+                    };
+                    last_field_id = field_ident.id;
+                }
+                $($crate::__thrift_result_required_or_optional!($required_or_optional $field_name);)*
+                Ok(Self {
+                    $($field_name),*
+                })
+            }
+        }
+    }
+}
+
+/// only implements WriteThrift for the given IDL struct definition
+#[macro_export]
+macro_rules! thrift_struct_write_impl {
+    ($(#[$($def_attrs:tt)*])* $vis:vis struct $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $required_or_optional:ident $field_type:ident $(< $field_lt:lifetime >)? $(< $element_type:ident >)? $field_name:ident $(= $default_value:literal)? $(;)?)* }) => {
+        impl $(<$lt>)? WriteThrift for $identifier $(<$lt>)? {
+            const ELEMENT_TYPE: ElementType = ElementType::Struct;
+
+            #[allow(unused_assignments)]
+            fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
+                #[allow(unused_mut, unused_variables)]
+                let mut last_field_id = 0i16;
+                $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)*
+                writer.write_struct_end()
+            }
+        }
+
+        impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? {
+            fn write_thrift_field<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>, field_id: i16, last_field_id: i16) -> Result<i16> {
+                writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?;
+                self.write_thrift(writer)?;
+                Ok(field_id)
+            }
+        }
+    }
+}
+
 #[doc(hidden)]
 #[macro_export]
 macro_rules! __thrift_write_required_or_optional_field {
@@ -429,7 +489,7 @@ macro_rules! __thrift_read_field {
     };
     ($prot:tt, $field_ident:tt, binary) => {
         // this one needs to not conflict with `list`
-        $prot.read_bytes()?.to_vec()
+        $prot.read_bytes_owned()?
    };
     ($prot:tt, $field_ident:tt, double) => {
         $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)?
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index ded1db711be0..ffa5409ac114 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -20,7 +20,10 @@
 // to not allocate byte arrays or strings.
 #![allow(dead_code)]
 
-use std::{cmp::Ordering, io::Write};
+use std::{
+    cmp::Ordering,
+    io::{Read, Write},
+};
 
 use crate::errors::{ParquetError, Result};
 
@@ -173,6 +176,8 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
 
     fn read_bytes(&mut self) -> Result<&'a [u8]>;
 
+    fn read_bytes_owned(&mut self) -> Result<Vec<u8>>;
+
     fn skip_bytes(&mut self, n: usize) -> Result<()>;
 
     fn read_vlq(&mut self) -> Result<u64> {
@@ -412,6 +417,10 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'b> for ThriftSliceInputProtocol<'a>
         Ok(ret)
     }
 
+    fn read_bytes_owned(&mut self) -> Result<Vec<u8>> {
+        Ok(self.read_bytes()?.to_vec())
+    }
+
     #[inline]
     fn skip_bytes(&mut self, n: usize) -> Result<()> {
         self.buf.get(..n).ok_or_else(eof_error)?;
@@ -433,6 +442,52 @@ fn eof_error() -> ParquetError {
     eof_err!("Unexpected EOF")
 }
 
+// input protocol that's only intended for use in reading page headers. not fully implemented
+// so this shouldn't be used generally.
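+//
+// intended use is roughly the following sketch (assumes any `std::io::Read`,
+// e.g. a file; error handling elided):
+//
+//     let mut prot = ThriftReadInputProtocol::new(&mut reader);
+//     let header = PageHeader::read_thrift(&mut prot)?;
+//
+// note that `read_bytes` cannot return a slice borrowed from an arbitrary
+// reader, so only the owned variant (`read_bytes_owned`) is usable here.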
+pub(crate) struct ThriftReadInputProtocol<R: Read> {
+    reader: R,
+}
+
+impl<R: Read> ThriftReadInputProtocol<R> {
+    pub(crate) fn new(reader: R) -> Self {
+        Self { reader }
+    }
+}
+
+impl<'a, R: Read> ThriftCompactInputProtocol<'a> for ThriftReadInputProtocol<R> {
+    #[inline]
+    fn read_byte(&mut self) -> Result<u8> {
+        let mut buf = [0_u8; 1];
+        self.reader.read_exact(&mut buf)?;
+        Ok(buf[0])
+    }
+
+    fn read_bytes(&mut self) -> Result<&'a [u8]> {
+        unimplemented!()
+    }
+
+    fn read_bytes_owned(&mut self) -> Result<Vec<u8>> {
+        let len = self.read_vlq()? as usize;
+        let mut v = Vec::with_capacity(len);
+        std::io::copy(&mut self.reader.by_ref().take(len as u64), &mut v)?;
+        Ok(v)
+    }
+
+    fn skip_bytes(&mut self, n: usize) -> Result<()> {
+        std::io::copy(
+            &mut self.reader.by_ref().take(n as u64),
+            &mut std::io::sink(),
+        )?;
+        Ok(())
+    }
+
+    fn read_double(&mut self) -> Result<f64> {
+        let mut buf = [0_u8; 8];
+        self.reader.read_exact(&mut buf)?;
+        Ok(f64::from_le_bytes(buf))
+    }
+}
+
 pub(crate) trait ReadThrift<'a, R: ThriftCompactInputProtocol<'a>> {
     // used to read generated enums and structs
     fn read_thrift(prot: &mut R) -> Result<Self>
@@ -484,7 +539,7 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a str {
 
 impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for String {
     fn read_thrift(prot: &mut R) -> Result<String> {
-        Ok(prot.read_string()?.to_owned())
+        Ok(String::from_utf8(prot.read_bytes_owned()?)?)
     }
 }

From 2250e18023c8cd10796dff0548db55b3f681c001 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 28 Aug 2025 22:18:52 -0700
Subject: [PATCH 044/126] get new page headers working for read and write

some tests fail still because we no longer read page stats
---
 parquet/src/arrow/arrow_writer/mod.rs         |  19 +-
 parquet/src/column/page.rs                    |  32 +--
 parquet/src/column/page_encryption.rs         |  10 +-
 .../src/column/page_encryption_disabled.rs    |   4 +-
 parquet/src/column/writer/mod.rs              |   7 +-
 parquet/src/encryption/encrypt.rs             |  28 +++
 parquet/src/file/metadata/thrift_gen.rs       | 204 +++++++-----------
 parquet/src/file/serialized_reader.rs         |  69 +++---
 parquet/src/file/statistics.rs                |  50 +++++
 parquet/src/file/writer.rs                    |  10 +-
 parquet/src/parquet_thrift.rs                 |   5 +-
 11 files changed, 224 insertions(+), 214 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index bd9f30c36103..598b324157f2 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -23,7 +23,6 @@ use std::iter::Peekable;
 use std::slice::Iter;
 use std::sync::{Arc, Mutex};
 use std::vec::IntoIter;
-use thrift::protocol::TCompactOutputProtocol;
 
 use arrow_array::cast::AsArray;
 use arrow_array::types::*;
@@ -48,8 +47,8 @@ use crate::file::metadata::{KeyValue, RowGroupMetaData};
 use crate::file::properties::{WriterProperties, WriterPropertiesPtr};
 use crate::file::reader::{ChunkReader, Length};
 use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
+use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift};
 use crate::schema::types::{ColumnDescPtr, SchemaDescriptor};
-use crate::thrift::TSerializable;
 use levels::{calculate_array_levels, ArrayLevels};
 
 mod byte_array;
@@ -583,8 +582,8 @@ impl PageWriter for ArrowPageWriter {
                 }
             }
             None => {
-                let mut protocol = TCompactOutputProtocol::new(&mut header);
-                page_header.write_to_out_protocol(&mut protocol)?;
+                let mut protocol = ThriftCompactOutputProtocol::new(&mut header);
+                page_header.write_thrift(&mut protocol)?;
             }
         };
 
@@ -1487,12 +1486,14 @@ mod tests {
 
     use
crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; + use crate::file::metadata::thrift_gen::PageHeaderWithStats; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::format::PageHeader; + use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; use crate::schema::types::ColumnPath; - use crate::thrift::TCompactSliceInputProtocol; + use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Schema}; use arrow::error::Result as ArrowResult; @@ -4191,8 +4192,8 @@ mod tests { // decode first page header let first_page = &buf[4..]; - let mut prot = TCompactSliceInputProtocol::new(first_page); - let hdr = PageHeader::read_from_in_protocol(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(first_page); + let hdr = PageHeaderWithStats::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; assert!(stats.is_none()); @@ -4225,8 +4226,8 @@ mod tests { // decode first page header let first_page = &buf[4..]; - let mut prot = TCompactSliceInputProtocol::new(first_page); - let hdr = PageHeader::read_from_in_protocol(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(first_page); + let hdr = PageHeaderWithStats::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; let stats = stats.unwrap(); diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 1dabe6794f07..556428fc9690 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -21,8 +21,10 @@ use bytes::Bytes; use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; -use crate::file::statistics::Statistics; -use crate::format::PageHeader; +use crate::file::metadata::thrift_gen::{ + DataPageHeaderV2WithStats, DataPageHeaderWithStats, DictionaryPageHeader, PageHeaderWithStats, +}; +use crate::file::statistics::{page_stats_to_thrift, Statistics}; /// Parquet Page definition. /// @@ -196,14 +198,14 @@ impl CompressedPage { } /// Returns the thrift page header - pub(crate) fn to_thrift_header(&self) -> PageHeader { + pub(crate) fn to_thrift_header(&self) -> PageHeaderWithStats { let uncompressed_size = self.uncompressed_size(); let compressed_size = self.compressed_size(); let num_values = self.num_values(); let encoding = self.encoding(); let page_type = self.page_type(); - let mut page_header = PageHeader { + let mut page_header = PageHeaderWithStats { type_: page_type.into(), uncompressed_page_size: uncompressed_size as i32, compressed_page_size: compressed_size as i32, @@ -222,12 +224,12 @@ impl CompressedPage { ref statistics, .. } => { - let data_page_header = crate::format::DataPageHeader { + let data_page_header = DataPageHeaderWithStats { num_values: num_values as i32, encoding: encoding.into(), definition_level_encoding: def_level_encoding.into(), repetition_level_encoding: rep_level_encoding.into(), - statistics: crate::file::statistics::to_thrift(statistics.as_ref()), + statistics: page_stats_to_thrift(statistics.as_ref()), }; page_header.data_page_header = Some(data_page_header); } @@ -240,7 +242,7 @@ impl CompressedPage { ref statistics, .. 
} => { - let data_page_header_v2 = crate::format::DataPageHeaderV2 { + let data_page_header_v2 = DataPageHeaderV2WithStats { num_values: num_values as i32, num_nulls: num_nulls as i32, num_rows: num_rows as i32, @@ -248,12 +250,12 @@ impl CompressedPage { definition_levels_byte_length: def_levels_byte_len as i32, repetition_levels_byte_length: rep_levels_byte_len as i32, is_compressed: Some(is_compressed), - statistics: crate::file::statistics::to_thrift(statistics.as_ref()), + statistics: page_stats_to_thrift(statistics.as_ref()), }; page_header.data_page_header_v2 = Some(data_page_header_v2); } Page::DictionaryPage { is_sorted, .. } => { - let dictionary_page_header = crate::format::DictionaryPageHeader { + let dictionary_page_header = DictionaryPageHeader { num_values: num_values as i32, encoding: encoding.into(), is_sorted: Some(is_sorted), @@ -331,12 +333,14 @@ pub struct PageMetadata { pub is_dict: bool, } -impl TryFrom<&PageHeader> for PageMetadata { +impl TryFrom<&crate::file::metadata::thrift_gen::PageHeader> for PageMetadata { type Error = ParquetError; - fn try_from(value: &PageHeader) -> std::result::Result { + fn try_from( + value: &crate::file::metadata::thrift_gen::PageHeader, + ) -> std::result::Result { match value.type_ { - crate::format::PageType::DATA_PAGE => { + PageType::DATA_PAGE => { let header = value.data_page_header.as_ref().unwrap(); Ok(PageMetadata { num_rows: None, @@ -344,12 +348,12 @@ impl TryFrom<&PageHeader> for PageMetadata { is_dict: false, }) } - crate::format::PageType::DICTIONARY_PAGE => Ok(PageMetadata { + PageType::DICTIONARY_PAGE => Ok(PageMetadata { num_rows: None, num_levels: None, is_dict: true, }), - crate::format::PageType::DATA_PAGE_V2 => { + PageType::DATA_PAGE_V2 => { let header = value.data_page_header_v2.as_ref().unwrap(); Ok(PageMetadata { num_rows: Some(header.num_rows as _), diff --git a/parquet/src/column/page_encryption.rs b/parquet/src/column/page_encryption.rs index 0fb7c8942675..a491778a065c 100644 --- a/parquet/src/column/page_encryption.rs +++ b/parquet/src/column/page_encryption.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. 
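+// Rough sketch of the page-header encryption flow implemented below (the
+// `create_module_aad` arguments are abbreviated here; see `encrypt_page_header`):
+//
+//     let aad = create_module_aad(/* file aad, module type, ordinals */)?;
+//     encrypt_thrift_object(&page_header, &mut block_encryptor, sink, &aad)?;
+//
+// i.e. the header is compact-thrift serialized into a buffer first, and the
+// whole buffer is then encrypted as a single module and written to the sink.
+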
+use crate::basic::PageType; use crate::column::page::CompressedPage; use crate::encryption::ciphers::BlockEncryptor; -use crate::encryption::encrypt::{encrypt_object, FileEncryptor}; +use crate::encryption::encrypt::{encrypt_thrift_object, FileEncryptor}; use crate::encryption::modules::{create_module_aad, ModuleType}; use crate::errors::ParquetError; use crate::errors::Result; -use crate::format::PageHeader; -use crate::format::PageType; +use crate::file::metadata::thrift_gen::PageHeaderWithStats; use bytes::Bytes; use std::io::Write; use std::sync::Arc; @@ -92,7 +92,7 @@ impl PageEncryptor { /// Encrypt a column page header pub fn encrypt_page_header( &mut self, - page_header: &PageHeader, + page_header: &PageHeaderWithStats, sink: &mut W, ) -> Result<()> { let module_type = match page_header.type_ { @@ -114,6 +114,6 @@ impl PageEncryptor { Some(self.page_index), )?; - encrypt_object(page_header, &mut self.block_encryptor, sink, &aad) + encrypt_thrift_object(page_header, &mut self.block_encryptor, sink, &aad) } } diff --git a/parquet/src/column/page_encryption_disabled.rs b/parquet/src/column/page_encryption_disabled.rs index e85b0281168a..a028881f5b51 100644 --- a/parquet/src/column/page_encryption_disabled.rs +++ b/parquet/src/column/page_encryption_disabled.rs @@ -17,7 +17,7 @@ use crate::column::page::CompressedPage; use crate::errors::Result; -use crate::format::PageHeader; +use crate::file::metadata::thrift_gen::PageHeaderWithStats; use std::io::Write; #[derive(Debug)] @@ -36,7 +36,7 @@ impl PageEncryptor { pub fn encrypt_page_header( &mut self, - _page_header: &PageHeader, + _page_header: &PageHeaderWithStats, _sink: &mut W, ) -> Result<()> { unreachable!("The encryption feature is disabled") diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 1e6f4f6f0706..33fd6285c7c4 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2266,10 +2266,13 @@ mod tests { let pages = reader.collect::>>().unwrap(); assert_eq!(pages.len(), 2); + /* TODO(ets): need to seek to the data page and decode ourselves since + * the stats are no longer read + */ assert_eq!(pages[0].page_type(), PageType::DICTIONARY_PAGE); assert_eq!(pages[1].page_type(), PageType::DATA_PAGE); - let page_statistics = pages[1].statistics().unwrap(); + /*let page_statistics = pages[1].statistics().unwrap(); assert_eq!( page_statistics.min_bytes_opt().unwrap(), 1_i32.to_le_bytes() @@ -2279,7 +2282,7 @@ mod tests { 7_i32.to_le_bytes() ); assert_eq!(page_statistics.null_count_opt(), Some(0)); - assert!(page_statistics.distinct_count_opt().is_none()); + assert!(page_statistics.distinct_count_opt().is_none());*/ } #[test] diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 1a241bf7b170..97893021699e 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -22,6 +22,7 @@ use crate::encryption::ciphers::{ }; use crate::errors::{ParquetError, Result}; use crate::file::column_crypto_metadata::{ColumnCryptoMetaData, EncryptionWithColumnKey}; +use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; use crate::thrift::TSerializable; use ring::rand::{SecureRandom, SystemRandom}; @@ -376,6 +377,18 @@ pub(crate) fn encrypt_object( Ok(()) } +/// Write an encrypted Thrift serializable object +pub(crate) fn encrypt_thrift_object( + object: &T, + encryptor: &mut Box, + sink: &mut W, + module_aad: &[u8], +) -> Result<()> { + let 
encrypted_buffer = encrypt_thrift_object_to_vec(object, encryptor, module_aad)?; + sink.write_all(&encrypted_buffer)?; + Ok(()) +} + pub(crate) fn write_signed_plaintext_object( object: &T, encryptor: &mut Box, @@ -414,6 +427,21 @@ pub(crate) fn encrypt_object_to_vec( encryptor.encrypt(buffer.as_ref(), module_aad) } +/// Encrypt a Thrift serializable object to a byte vector +pub(crate) fn encrypt_thrift_object_to_vec( + object: &T, + encryptor: &mut Box, + module_aad: &[u8], +) -> Result> { + let mut buffer: Vec = vec![]; + { + let mut unencrypted_protocol = ThriftCompactOutputProtocol::new(&mut buffer); + object.write_thrift(&mut unencrypted_protocol)?; + } + + encryptor.encrypt(buffer.as_ref(), module_aad) +} + /// Get the crypto metadata for a column from the file encryption properties pub(crate) fn get_column_crypto_metadata( properties: &FileEncryptionProperties, diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 85cf27085f85..ceb631519ee4 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -39,7 +39,7 @@ use crate::{ ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, - thrift_struct, thrift_struct_write_impl, thrift_union, + thrift_struct, thrift_union, util::bit_util::FromBytes, }; #[cfg(feature = "encryption")] @@ -812,85 +812,21 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaDat } } -// page header stuff. this is partially hand coded so we can avoid parsing the page statistics. -#[derive(Clone, Debug, Eq, PartialEq)] -pub(crate) struct DataPageHeader<'a> { - pub(crate) num_values: i32, - pub(crate) encoding: Encoding, - pub(crate) definition_level_encoding: Encoding, - pub(crate) repetition_level_encoding: Encoding, - // this will only be used on write - pub(crate) statistics: Option>, -} - -thrift_struct_write_impl!( -struct DataPageHeader<'a> { +// the following structures are only meant for reading. the +// statistics have been removed from the data page headers since +// we don't ever use those stats, but they take an inordinate +// amount of time to decode. 
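+//
+// with the statistics fields absent from the IDL below, the readers generated
+// by `thrift_struct!` fall into their catch-all arm when they encounter the
+// statistics field (5 for v1 pages, 8 for v2), i.e. effectively:
+//
+//     _ => { prot.skip(field_ident.field_type)?; }
+//
+// which walks over the nested struct without materializing min/max byte arrays.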
+thrift_struct!( +pub(crate) struct DataPageHeader { 1: required i32 num_values 2: required Encoding encoding 3: required Encoding definition_level_encoding; 4: required Encoding repetition_level_encoding; - 5: optional Statistics<'a> statistics; } ); -// read data page header but skip statistics -impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for DataPageHeader<'a> { - fn read_thrift(prot: &mut R) -> Result { - let mut num_values: Option = None; - let mut encoding: Option = None; - let mut definition_level_encoding: Option = None; - let mut repetition_level_encoding: Option = None; - - let mut last_field_id = 0i16; - loop { - let field_ident = prot.read_field_begin(last_field_id)?; - if field_ident.field_type == FieldType::Stop { - break; - } - match field_ident.id { - 1 => num_values = Some(prot.read_i32()?), - 2 => encoding = Some(Encoding::read_thrift(prot)?), - 3 => definition_level_encoding = Some(Encoding::read_thrift(prot)?), - 4 => repetition_level_encoding = Some(Encoding::read_thrift(prot)?), - _ => { - prot.skip(field_ident.field_type)?; - } - }; - last_field_id = field_ident.id; - } - - let num_values = num_values.expect("Required field num_values is missing"); - let encoding = encoding.expect("Required field encoding is missing"); - let definition_level_encoding = - definition_level_encoding.expect("Required field definition_level_encoding is missing"); - let repetition_level_encoding = - repetition_level_encoding.expect("Required field repetition_level_encoding is missing"); - - Ok(Self { - num_values, - encoding, - definition_level_encoding, - repetition_level_encoding, - statistics: None, - }) - } -} - -#[derive(Clone, Debug, Eq, PartialEq)] -pub(crate) struct DataPageHeaderV2<'a> { - pub(crate) num_values: i32, - pub(crate) num_nulls: i32, - pub(crate) num_rows: i32, - pub(crate) encoding: Encoding, - pub(crate) definition_levels_byte_length: i32, - pub(crate) repetition_levels_byte_length: i32, - pub(crate) is_compressed: Option, - // this will only be used on write - pub(crate) statistics: Option>, -} - -thrift_struct_write_impl!( -struct DataPageHeaderV2<'a> { +thrift_struct!( +pub(crate) struct DataPageHeaderV2 { 1: required i32 num_values 2: required i32 num_nulls 3: required i32 num_rows @@ -898,64 +834,9 @@ struct DataPageHeaderV2<'a> { 5: required i32 definition_levels_byte_length; 6: required i32 repetition_levels_byte_length; 7: optional bool is_compressed = true; - 8: optional Statistics<'a> statistics; } ); -// read data page header but skip statistics -impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for DataPageHeaderV2<'a> { - fn read_thrift(prot: &mut R) -> Result { - let mut num_values: Option = None; - let mut num_nulls: Option = None; - let mut num_rows: Option = None; - let mut encoding: Option = None; - let mut definition_levels_byte_length: Option = None; - let mut repetition_levels_byte_length: Option = None; - let mut is_compressed: Option = Some(true); - - let mut last_field_id = 0i16; - loop { - let field_ident = prot.read_field_begin(last_field_id)?; - if field_ident.field_type == FieldType::Stop { - break; - } - match field_ident.id { - 1 => num_values = Some(prot.read_i32()?), - 2 => num_nulls = Some(prot.read_i32()?), - 3 => num_rows = Some(prot.read_i32()?), - 4 => encoding = Some(Encoding::read_thrift(prot)?), - 5 => definition_levels_byte_length = Some(prot.read_i32()?), - 6 => repetition_levels_byte_length = Some(prot.read_i32()?), - 7 => is_compressed = field_ident.bool_val, - _ => { - 
prot.skip(field_ident.field_type)?; - } - }; - last_field_id = field_ident.id; - } - - let num_values = num_values.expect("Required field num_values is missing"); - let num_nulls = num_nulls.expect("Required field num_nulls is missing"); - let num_rows = num_rows.expect("Required field num_rows is missing"); - let encoding = encoding.expect("Required field encoding is missing"); - let definition_levels_byte_length = definition_levels_byte_length - .expect("Required field definition_levels_byte_length is missing"); - let repetition_levels_byte_length = repetition_levels_byte_length - .expect("Required field repetition_levels_byte_length is missing"); - - Ok(Self { - num_values, - num_nulls, - num_rows, - encoding, - definition_levels_byte_length, - repetition_levels_byte_length, - is_compressed, - statistics: None, - }) - } -} - thrift_struct!( pub(crate) struct IndexPageHeader {} ); @@ -975,7 +856,7 @@ pub(crate) struct DictionaryPageHeader { thrift_struct!( #[allow(dead_code)] -pub(crate) struct PageHeader<'a> { +pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -989,13 +870,72 @@ pub(crate) struct PageHeader<'a> { 4: optional i32 crc // Headers for page specific data. One only will be set. - 5: optional DataPageHeader<'a> data_page_header; + 5: optional DataPageHeader data_page_header; 6: optional IndexPageHeader index_page_header; 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2<'a> data_page_header_v2; + 8: optional DataPageHeaderV2 data_page_header_v2; +} +); + +// these page headers are for the write side...they have statistics that don't require lifetimes +thrift_struct!( +pub(crate) struct PageStatistics { + 1: optional binary max; + 2: optional binary min; + 3: optional i64 null_count; + 4: optional i64 distinct_count; + 5: optional binary max_value; + 6: optional binary min_value; + 7: optional bool is_max_value_exact; + 8: optional bool is_min_value_exact; +} +); + +thrift_struct!( +pub(crate) struct DataPageHeaderWithStats { + 1: required i32 num_values + 2: required Encoding encoding + 3: required Encoding definition_level_encoding; + 4: required Encoding repetition_level_encoding; + 5: optional PageStatistics statistics; +} +); + +thrift_struct!( +pub(crate) struct DataPageHeaderV2WithStats { + 1: required i32 num_values + 2: required i32 num_nulls + 3: required i32 num_rows + 4: required Encoding encoding + 5: required i32 definition_levels_byte_length; + 6: required i32 repetition_levels_byte_length; + 7: optional bool is_compressed = true; + 8: optional PageStatistics statistics; } ); +thrift_struct!( +#[allow(dead_code)] +pub(crate) struct PageHeaderWithStats { + /// the type of the page: indicates which of the *_header fields is set + 1: required PageType type_ + + /// Uncompressed page size in bytes (not including this header) + 2: required i32 uncompressed_page_size + + /// Compressed (and potentially encrypted) page size in bytes, not including this header + 3: required i32 compressed_page_size + + /// The 32-bit CRC checksum for the page, to be be calculated as follows: + 4: optional i32 crc + + // Headers for page specific data. One only will be set. 
+ 5: optional DataPageHeaderWithStats data_page_header; + 6: optional IndexPageHeader index_page_header; + 7: optional DictionaryPageHeader dictionary_page_header; + 8: optional DataPageHeaderV2WithStats data_page_header_v2; +} +); #[cfg(test)] mod tests { @@ -1037,4 +977,4 @@ mod tests { mmax: Some(42.0.into()), }); } -} \ No newline at end of file +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 335f0bc3601b..371d0fe72e9c 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -18,31 +18,26 @@ //! Contains implementations of the reader traits FileReader, RowGroupReader and PageReader //! Also contains implementations of the ChunkReader for files (with buffering) and byte arrays (RAM) -use crate::basic::{Encoding, Type}; +use crate::basic::PageType; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{read_and_decrypt, CryptoContext}; use crate::errors::{ParquetError, Result}; +use crate::file::metadata::thrift_gen::PageHeader; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::{ metadata::*, properties::{ReaderProperties, ReaderPropertiesPtr}, reader::*, - statistics, }; -use crate::format::{PageHeader, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; -#[cfg(feature = "encryption")] -use crate::thrift::TCompactSliceInputProtocol; -use crate::thrift::TSerializable; use bytes::Bytes; use std::collections::VecDeque; use std::{fs::File, io::Read, path::Path, sync::Arc}; -use thrift::protocol::TCompactInputProtocol; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -344,7 +339,6 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R pub(crate) fn decode_page( page_header: PageHeader, buffer: Bytes, - physical_type: Type, decompressor: Option<&mut Box>, ) -> Result { // Verify the 32-bit CRC checksum of the page @@ -423,7 +417,7 @@ pub(crate) fn decode_page( Page::DictionaryPage { buf: buffer, num_values: dict_header.num_values.try_into()?, - encoding: Encoding::try_from(dict_header.encoding)?, + encoding: dict_header.encoding, is_sorted, } } @@ -434,10 +428,10 @@ pub(crate) fn decode_page( Page::DataPage { buf: buffer, num_values: header.num_values.try_into()?, - encoding: Encoding::try_from(header.encoding)?, - def_level_encoding: Encoding::try_from(header.definition_level_encoding)?, - rep_level_encoding: Encoding::try_from(header.repetition_level_encoding)?, - statistics: statistics::from_thrift(physical_type, header.statistics)?, + encoding: header.encoding, + def_level_encoding: header.definition_level_encoding, + rep_level_encoding: header.repetition_level_encoding, + statistics: None, } } PageType::DATA_PAGE_V2 => { @@ -448,13 +442,13 @@ pub(crate) fn decode_page( Page::DataPageV2 { buf: buffer, num_values: header.num_values.try_into()?, - encoding: Encoding::try_from(header.encoding)?, + encoding: header.encoding, num_nulls: header.num_nulls.try_into()?, num_rows: header.num_rows.try_into()?, def_levels_byte_len: header.definition_levels_byte_length.try_into()?, rep_levels_byte_len: header.repetition_levels_byte_length.try_into()?, is_compressed, - statistics: statistics::from_thrift(physical_type, header.statistics)?, + statistics: None, } } _ => { @@ -512,9 +506,6 @@ pub struct SerializedPageReader { /// The 
compression codec for this column chunk. Only set for non-PLAIN codec. decompressor: Option>, - /// Column chunk type. - physical_type: Type, - state: SerializedPageReaderState, context: SerializedPageReaderContext, @@ -614,7 +605,6 @@ impl SerializedPageReader { reader, decompressor, state, - physical_type: meta.column_type(), context: Default::default(), }) } @@ -732,8 +722,10 @@ impl SerializedPageReaderContext { _page_index: usize, _dictionary_page: bool, ) -> Result { - let mut prot = TCompactInputProtocol::new(input); - Ok(PageHeader::read_from_in_protocol(&mut prot)?) + use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; + + let mut prot = ThriftReadInputProtocol::new(input); + Ok(PageHeader::read_thrift(&mut prot)?) } fn decrypt_page_data( @@ -756,10 +748,14 @@ impl SerializedPageReaderContext { ) -> Result { match self.page_crypto_context(page_index, dictionary_page) { None => { - let mut prot = TCompactInputProtocol::new(input); - Ok(PageHeader::read_from_in_protocol(&mut prot)?) + use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; + + let mut prot = ThriftReadInputProtocol::new(input); + Ok(PageHeader::read_thrift(&mut prot)?) } Some(page_crypto_context) => { + use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; + let data_decryptor = page_crypto_context.data_decryptor(); let aad = page_crypto_context.create_page_header_aad()?; @@ -770,8 +766,8 @@ impl SerializedPageReaderContext { )) })?; - let mut prot = TCompactSliceInputProtocol::new(buf.as_slice()); - Ok(PageHeader::read_from_in_protocol(&mut prot)?) + let mut prot = ThriftSliceInputProtocol::new(buf.as_slice()); + Ok(PageHeader::read_thrift(&mut prot)?) } } } @@ -894,12 +890,8 @@ impl PageReader for SerializedPageReader { self.context .decrypt_page_data(buffer, *page_index, *require_dictionary)?; - let page = decode_page( - header, - Bytes::from(buffer), - self.physical_type, - self.decompressor.as_mut(), - )?; + let page = + decode_page(header, Bytes::from(buffer), self.decompressor.as_mut())?; if page.is_data_page() { *page_index += 1; } else if page.is_dictionary_page() { @@ -938,12 +930,7 @@ impl PageReader for SerializedPageReader { if !is_dictionary_page { *page_index += 1; } - decode_page( - header, - bytes, - self.physical_type, - self.decompressor.as_mut(), - )? + decode_page(header, bytes, self.decompressor.as_mut())? 
} }; @@ -1107,7 +1094,7 @@ mod tests { }; use crate::file::properties::{EnabledStatistics, WriterProperties}; - use crate::basic::{self, BoundaryOrder, ColumnOrder, SortOrder}; + use crate::basic::{self, BoundaryOrder, ColumnOrder, Encoding, SortOrder}; use crate::column::reader::ColumnReader; use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType, Int32Type}; @@ -1396,7 +1383,7 @@ mod tests { assert_eq!(def_levels_byte_len, 2); assert_eq!(rep_levels_byte_len, 0); assert!(is_compressed); - assert!(statistics.is_some()); + assert!(statistics.is_none()); // page stats are no longer read true } _ => false, @@ -1498,7 +1485,7 @@ mod tests { assert_eq!(def_levels_byte_len, 2); assert_eq!(rep_levels_byte_len, 0); assert!(is_compressed); - assert!(statistics.is_some()); + assert!(statistics.is_none()); // page stats are no longer read true } _ => false, @@ -1877,7 +1864,7 @@ mod tests { let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); assert_eq!( ret.err().unwrap().to_string(), - "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" + "Parquet error: Received empty union from remote ColumnOrder" ); } diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index d4501830ac40..674b055f34ed 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -45,6 +45,7 @@ use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::errors::{ParquetError, Result}; +use crate::file::metadata::thrift_gen::PageStatistics; use crate::util::bit_util::FromBytes; pub(crate) mod private { @@ -315,6 +316,55 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option) -> Option { + let stats = stats?; + + // record null count if it can fit in i64 + let null_count = stats + .null_count_opt() + .and_then(|value| i64::try_from(value).ok()); + + // record distinct count if it can fit in i64 + let distinct_count = stats + .distinct_count_opt() + .and_then(|value| i64::try_from(value).ok()); + + let mut thrift_stats = PageStatistics { + max: None, + min: None, + null_count, + distinct_count, + max_value: None, + min_value: None, + is_max_value_exact: None, + is_min_value_exact: None, + }; + + // Get min/max if set. + let (min, max, min_exact, max_exact) = ( + stats.min_bytes_opt().map(|x| x.to_vec()), + stats.max_bytes_opt().map(|x| x.to_vec()), + Some(stats.min_is_exact()), + Some(stats.max_is_exact()), + ); + if stats.is_min_max_backwards_compatible() { + // Copy to deprecated min, max values for compatibility with older readers + thrift_stats.min.clone_from(&min); + thrift_stats.max.clone_from(&max); + } + + if !stats.is_min_max_deprecated() { + thrift_stats.min_value = min; + thrift_stats.max_value = max; + } + + thrift_stats.is_min_value_exact = min_exact; + thrift_stats.is_max_value_exact = max_exact; + + Some(thrift_stats) +} + /// Strongly typed statistics for a column chunk within a row group. /// /// This structure is a natively typed, in memory representation of the diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 65b96246ea03..c62b7295a771 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -19,13 +19,13 @@ //! using row group writers and column writers respectively. 
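+//!
+//! Page headers are serialized with the in-crate compact-thrift writer. A
+//! minimal sketch of that path (assuming `header` is any `WriteThrift` value;
+//! error handling elided):
+//!
+//!     let mut protocol = ThriftCompactOutputProtocol::new(&mut sink);
+//!     header.write_thrift(&mut protocol)?;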
use crate::bloom_filter::Sbbf; +use crate::file::metadata::thrift_gen::PageHeaderWithStats; use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; -use crate::thrift::TSerializable; +use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use std::fmt::Debug; use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; -use thrift::protocol::TCompactOutputProtocol; use crate::column::page_encryption::PageEncryptor; use crate::column::writer::{get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl}; @@ -939,15 +939,15 @@ impl<'a, W: Write> SerializedPageWriter<'a, W> { /// Serializes page header into Thrift. /// Returns number of bytes that have been written into the sink. #[inline] - fn serialize_page_header(&mut self, header: crate::format::PageHeader) -> Result { + fn serialize_page_header(&mut self, header: PageHeaderWithStats) -> Result { let start_pos = self.sink.bytes_written(); match self.page_encryptor_and_sink_mut() { Some((page_encryptor, sink)) => { page_encryptor.encrypt_page_header(&header, sink)?; } None => { - let mut protocol = TCompactOutputProtocol::new(&mut self.sink); - header.write_to_out_protocol(&mut protocol)?; + let mut protocol = ThriftCompactOutputProtocol::new(&mut self.sink); + header.write_thrift(&mut protocol)?; } } Ok(self.sink.bytes_written() - start_pos) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index ffa5409ac114..bdb53842b3bc 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -917,10 +917,7 @@ pub(crate) mod tests { pub(crate) fn test_roundtrip(val: T) where - T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> - + WriteThrift - + PartialEq - + Debug, + T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift + PartialEq + Debug, { let buf = Vec::::new(); let mut writer = ThriftCompactOutputProtocol::new(buf); From 6af863178bf93f803960777751928d2d20949a25 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 22:52:02 -0700 Subject: [PATCH 045/126] rename page header structs --- parquet/src/arrow/arrow_writer/mod.rs | 16 +++++------- parquet/src/column/page.rs | 14 +++++----- parquet/src/column/page_encryption.rs | 4 +-- .../src/column/page_encryption_disabled.rs | 4 +-- parquet/src/file/metadata/thrift_gen.rs | 21 +++++++-------- parquet/src/file/serialized_reader.rs | 26 ++++++++++++------- parquet/src/file/writer.rs | 6 +++-- parquet/tests/arrow_reader/bad_data.rs | 8 +++++- 8 files changed, 55 insertions(+), 44 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 598b324157f2..0c021aad0c69 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1486,14 +1486,12 @@ mod tests { use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; - use crate::file::metadata::thrift_gen::PageHeaderWithStats; + use crate::file::metadata::thrift_gen::PageHeader; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; - use crate::format::PageHeader; use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; use crate::schema::types::ColumnPath; - use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use arrow::datatypes::ToByteSlice; use 
arrow::datatypes::{DataType, Schema}; use arrow::error::Result as ArrowResult; @@ -4193,7 +4191,7 @@ mod tests { // decode first page header let first_page = &buf[4..]; let mut prot = ThriftSliceInputProtocol::new(first_page); - let hdr = PageHeaderWithStats::read_thrift(&mut prot).unwrap(); + let hdr = PageHeader::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; assert!(stats.is_none()); @@ -4227,7 +4225,7 @@ mod tests { // decode first page header let first_page = &buf[4..]; let mut prot = ThriftSliceInputProtocol::new(first_page); - let hdr = PageHeaderWithStats::read_thrift(&mut prot).unwrap(); + let hdr = PageHeader::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; let stats = stats.unwrap(); @@ -4278,8 +4276,8 @@ mod tests { // decode first page header let first_page = &buf[4..]; - let mut prot = TCompactSliceInputProtocol::new(first_page); - let hdr = PageHeader::read_from_in_protocol(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(first_page); + let hdr = PageHeader::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; assert!(stats.is_some()); let stats = stats.unwrap(); @@ -4291,8 +4289,8 @@ mod tests { // check second page now let second_page = &prot.as_slice()[hdr.compressed_page_size as usize..]; - let mut prot = TCompactSliceInputProtocol::new(second_page); - let hdr = PageHeader::read_from_in_protocol(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(second_page); + let hdr = PageHeader::read_thrift(&mut prot).unwrap(); let stats = hdr.data_page_header.unwrap().statistics; assert!(stats.is_some()); let stats = stats.unwrap(); diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 556428fc9690..c60ed4d0cff4 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -22,7 +22,7 @@ use bytes::Bytes; use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::thrift_gen::{ - DataPageHeaderV2WithStats, DataPageHeaderWithStats, DictionaryPageHeader, PageHeaderWithStats, + DataPageHeaderV2, DataPageHeader, DictionaryPageHeader, PageHeader, }; use crate::file::statistics::{page_stats_to_thrift, Statistics}; @@ -198,14 +198,14 @@ impl CompressedPage { } /// Returns the thrift page header - pub(crate) fn to_thrift_header(&self) -> PageHeaderWithStats { + pub(crate) fn to_thrift_header(&self) -> PageHeader { let uncompressed_size = self.uncompressed_size(); let compressed_size = self.compressed_size(); let num_values = self.num_values(); let encoding = self.encoding(); let page_type = self.page_type(); - let mut page_header = PageHeaderWithStats { + let mut page_header = PageHeader { type_: page_type.into(), uncompressed_page_size: uncompressed_size as i32, compressed_page_size: compressed_size as i32, @@ -224,7 +224,7 @@ impl CompressedPage { ref statistics, .. } => { - let data_page_header = DataPageHeaderWithStats { + let data_page_header = DataPageHeader { num_values: num_values as i32, encoding: encoding.into(), definition_level_encoding: def_level_encoding.into(), @@ -242,7 +242,7 @@ impl CompressedPage { ref statistics, .. 
} => { - let data_page_header_v2 = DataPageHeaderV2WithStats { + let data_page_header_v2 = DataPageHeaderV2 { num_values: num_values as i32, num_nulls: num_nulls as i32, num_rows: num_rows as i32, @@ -333,11 +333,11 @@ pub struct PageMetadata { pub is_dict: bool, } -impl TryFrom<&crate::file::metadata::thrift_gen::PageHeader> for PageMetadata { +impl TryFrom<&crate::file::metadata::thrift_gen::PageHeaderNoStats> for PageMetadata { type Error = ParquetError; fn try_from( - value: &crate::file::metadata::thrift_gen::PageHeader, + value: &crate::file::metadata::thrift_gen::PageHeaderNoStats, ) -> std::result::Result { match value.type_ { PageType::DATA_PAGE => { diff --git a/parquet/src/column/page_encryption.rs b/parquet/src/column/page_encryption.rs index a491778a065c..7ee367a289c8 100644 --- a/parquet/src/column/page_encryption.rs +++ b/parquet/src/column/page_encryption.rs @@ -22,7 +22,7 @@ use crate::encryption::encrypt::{encrypt_thrift_object, FileEncryptor}; use crate::encryption::modules::{create_module_aad, ModuleType}; use crate::errors::ParquetError; use crate::errors::Result; -use crate::file::metadata::thrift_gen::PageHeaderWithStats; +use crate::file::metadata::thrift_gen::PageHeader; use bytes::Bytes; use std::io::Write; use std::sync::Arc; @@ -92,7 +92,7 @@ impl PageEncryptor { /// Encrypt a column page header pub fn encrypt_page_header( &mut self, - page_header: &PageHeaderWithStats, + page_header: &PageHeader, sink: &mut W, ) -> Result<()> { let module_type = match page_header.type_ { diff --git a/parquet/src/column/page_encryption_disabled.rs b/parquet/src/column/page_encryption_disabled.rs index a028881f5b51..347024f7f21f 100644 --- a/parquet/src/column/page_encryption_disabled.rs +++ b/parquet/src/column/page_encryption_disabled.rs @@ -17,7 +17,7 @@ use crate::column::page::CompressedPage; use crate::errors::Result; -use crate::file::metadata::thrift_gen::PageHeaderWithStats; +use crate::file::metadata::thrift_gen::PageHeader; use std::io::Write; #[derive(Debug)] @@ -36,7 +36,7 @@ impl PageEncryptor { pub fn encrypt_page_header( &mut self, - _page_header: &PageHeaderWithStats, + _page_header: &PageHeader, _sink: &mut W, ) -> Result<()> { unreachable!("The encryption feature is disabled") diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index ceb631519ee4..d20902fa0644 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -817,7 +817,7 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaDat // we don't ever use those stats, but they take an inordinate // amount of time to decode. thrift_struct!( -pub(crate) struct DataPageHeader { +pub(crate) struct DataPageHeaderNoStats { 1: required i32 num_values 2: required Encoding encoding 3: required Encoding definition_level_encoding; @@ -826,7 +826,7 @@ pub(crate) struct DataPageHeader { ); thrift_struct!( -pub(crate) struct DataPageHeaderV2 { +pub(crate) struct DataPageHeaderV2NoStats { 1: required i32 num_values 2: required i32 num_nulls 3: required i32 num_rows @@ -855,8 +855,7 @@ pub(crate) struct DictionaryPageHeader { ); thrift_struct!( -#[allow(dead_code)] -pub(crate) struct PageHeader { +pub(crate) struct PageHeaderNoStats { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -870,10 +869,10 @@ pub(crate) struct PageHeader { 4: optional i32 crc // Headers for page specific data. One only will be set. 
- 5: optional DataPageHeader data_page_header; + 5: optional DataPageHeaderNoStats data_page_header; 6: optional IndexPageHeader index_page_header; 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2 data_page_header_v2; + 8: optional DataPageHeaderV2NoStats data_page_header_v2; } ); @@ -892,7 +891,7 @@ pub(crate) struct PageStatistics { ); thrift_struct!( -pub(crate) struct DataPageHeaderWithStats { +pub(crate) struct DataPageHeader { 1: required i32 num_values 2: required Encoding encoding 3: required Encoding definition_level_encoding; @@ -902,7 +901,7 @@ pub(crate) struct DataPageHeaderWithStats { ); thrift_struct!( -pub(crate) struct DataPageHeaderV2WithStats { +pub(crate) struct DataPageHeaderV2 { 1: required i32 num_values 2: required i32 num_nulls 3: required i32 num_rows @@ -916,7 +915,7 @@ pub(crate) struct DataPageHeaderV2WithStats { thrift_struct!( #[allow(dead_code)] -pub(crate) struct PageHeaderWithStats { +pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -930,10 +929,10 @@ pub(crate) struct PageHeaderWithStats { 4: optional i32 crc // Headers for page specific data. One only will be set. - 5: optional DataPageHeaderWithStats data_page_header; + 5: optional DataPageHeader data_page_header; 6: optional IndexPageHeader index_page_header; 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2WithStats data_page_header_v2; + 8: optional DataPageHeaderV2 data_page_header_v2; } ); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 371d0fe72e9c..06370afe84c0 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -25,7 +25,7 @@ use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{read_and_decrypt, CryptoContext}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::thrift_gen::PageHeader; +use crate::file::metadata::thrift_gen::PageHeaderNoStats; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::{ metadata::*, @@ -337,7 +337,7 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R /// Decodes a [`Page`] from the provided `buffer` pub(crate) fn decode_page( - page_header: PageHeader, + page_header: PageHeaderNoStats, buffer: Bytes, decompressor: Option<&mut Box>, ) -> Result { @@ -471,7 +471,7 @@ enum SerializedPageReaderState { remaining_bytes: u64, // If the next page header has already been "peeked", we will cache it and it`s length here - next_page_header: Option>, + next_page_header: Option>, /// The index of the data page within this column chunk page_index: usize, @@ -678,7 +678,7 @@ impl SerializedPageReader { input: &mut T, page_index: usize, dictionary_page: bool, - ) -> Result<(usize, PageHeader)> { + ) -> Result<(usize, PageHeaderNoStats)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -706,7 +706,7 @@ impl SerializedPageReader { buffer: &[u8], page_index: usize, dictionary_page: bool, - ) -> Result<(usize, PageHeader)> { + ) -> Result<(usize, PageHeaderNoStats)> { let mut input = std::io::Cursor::new(buffer); let header = context.read_page_header(&mut input, page_index, dictionary_page)?; let header_len = input.position() as usize; @@ -721,11 +721,11 @@ impl SerializedPageReaderContext { input: &mut T, _page_index: usize, _dictionary_page: bool, - 
) -> Result { + ) -> Result { use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; let mut prot = ThriftReadInputProtocol::new(input); - Ok(PageHeader::read_thrift(&mut prot)?) + Ok(PageHeaderNoStats::read_thrift(&mut prot)?) } fn decrypt_page_data( @@ -745,13 +745,13 @@ impl SerializedPageReaderContext { input: &mut T, page_index: usize, dictionary_page: bool, - ) -> Result { + ) -> Result { match self.page_crypto_context(page_index, dictionary_page) { None => { use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; let mut prot = ThriftReadInputProtocol::new(input); - Ok(PageHeader::read_thrift(&mut prot)?) + Ok(PageHeaderNoStats::read_thrift(&mut prot)?) } Some(page_crypto_context) => { use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; @@ -767,7 +767,7 @@ impl SerializedPageReaderContext { })?; let mut prot = ThriftSliceInputProtocol::new(buf.as_slice()); - Ok(PageHeader::read_thrift(&mut prot)?) + Ok(PageHeaderNoStats::read_thrift(&mut prot)?) } } } @@ -1862,6 +1862,12 @@ mod tests { 80, 65, 82, 49, ]; let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); + #[cfg(feature = "encryption")] + assert_eq!( + ret.err().unwrap().to_string(), + "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" + ); + #[cfg(not(feature = "encryption"))] assert_eq!( ret.err().unwrap().to_string(), "Parquet error: Received empty union from remote ColumnOrder" diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c62b7295a771..f914871620bd 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -19,7 +19,7 @@ //! using row group writers and column writers respectively. use crate::bloom_filter::Sbbf; -use crate::file::metadata::thrift_gen::PageHeaderWithStats; +use crate::file::metadata::thrift_gen::PageHeader; use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; @@ -939,7 +939,7 @@ impl<'a, W: Write> SerializedPageWriter<'a, W> { /// Serializes page header into Thrift. /// Returns number of bytes that have been written into the sink. 
#[inline] - fn serialize_page_header(&mut self, header: PageHeaderWithStats) -> Result { + fn serialize_page_header(&mut self, header: PageHeader) -> Result { let start_pos = self.sink.bytes_written(); match self.page_encryptor_and_sink_mut() { Some((page_encryptor, sink)) => { @@ -1426,6 +1426,7 @@ mod tests { } #[test] + #[ignore = "still working on page stats"] fn test_page_writer_data_pages() { let pages = vec![ Page::DataPage { @@ -1454,6 +1455,7 @@ mod tests { } #[test] + #[ignore = "still working on page stats"] fn test_page_writer_dict_pages() { let pages = vec![ Page::DictionaryPage { diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 58e342ab39d1..a2d8af538672 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -80,10 +80,16 @@ fn test_invalid_files() { #[test] fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); + #[cfg(feature="encryption")] assert_eq!( err.to_string(), "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); + #[cfg(not(feature="encryption"))] + assert_eq!( + err.to_string(), + "Parquet error: Unexpected Type -7" + ); } #[test] @@ -98,7 +104,7 @@ fn test_arrow_gh_41317() { let err = read_file("ARROW-GH-41317.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: External: bad data" + "External: Parquet argument error: Parquet error: StructArrayReader out of sync in read_records, expected 5 read, got 2" ); } From 377522294371d29169a2d69b86b3abf8ba88bf9b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 07:27:19 -0700 Subject: [PATCH 046/126] add some fixmes --- parquet/src/basic.rs | 2 +- parquet/src/file/writer.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 61c277c88825..ab058388413f 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -768,7 +768,7 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for Compression { } } -// FIXME +// FIXME(ets) // ugh...why did we add compression level to some variants if we don't use them???? 
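 // on the wire the level is dropped entirely: each parameterized variant
 // serializes as its bare codec id. illustrative sketch only (not the exact
 // impl; ids per the parquet.thrift CompressionCodec enum):
 //
 //     let id: i32 = match self {
 //         Compression::UNCOMPRESSED => 0,
 //         Compression::GZIP(_) => 2,  // configured level is write-side only
 //         Compression::ZSTD(_) => 6,  // likewise
 //         // ...remaining codecs follow the same pattern
 //     };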
impl WriteThrift for Compression { const ELEMENT_TYPE: ElementType = ElementType::I32; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index f914871620bd..9b0e8b1da698 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1425,6 +1425,7 @@ mod tests { ); } + // FIXME(ets) #[test] #[ignore = "still working on page stats"] fn test_page_writer_data_pages() { @@ -1454,6 +1455,7 @@ mod tests { test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); } + // FIXME(ets) #[test] #[ignore = "still working on page stats"] fn test_page_writer_dict_pages() { From 85f44a5a6c17c47b33441c3594dd0c9053444586 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 07:29:08 -0700 Subject: [PATCH 047/126] formatting --- parquet/src/column/page.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index c60ed4d0cff4..e6ddbbe8ea7a 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -22,7 +22,7 @@ use bytes::Bytes; use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::thrift_gen::{ - DataPageHeaderV2, DataPageHeader, DictionaryPageHeader, PageHeader, + DataPageHeader, DataPageHeaderV2, DictionaryPageHeader, PageHeader, }; use crate::file::statistics::{page_stats_to_thrift, Statistics}; From f0e538f26b752f7e88679fabc5350d9416fdf713 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 07:30:31 -0700 Subject: [PATCH 048/126] test results differ depending on features --- parquet/tests/arrow_reader/bad_data.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index a2d8af538672..be401030e7f9 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -80,16 +80,13 @@ fn test_invalid_files() { #[test] fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); - #[cfg(feature="encryption")] + #[cfg(feature = "encryption")] assert_eq!( err.to_string(), "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); - #[cfg(not(feature="encryption"))] - assert_eq!( - err.to_string(), - "Parquet error: Unexpected Type -7" - ); + #[cfg(not(feature = "encryption"))] + assert_eq!(err.to_string(), "Parquet error: Unexpected Type -7"); } #[test] From 763ecd7ff1ed96ad1ce731d9092dc199a0a08404 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 07:30:58 -0700 Subject: [PATCH 049/126] error rather than panic on missing required fields --- parquet/src/file/page_encoding_stats.rs | 2 +- parquet/src/parquet_macros.rs | 12 +++++++----- parquet/tests/encryption/encryption_agnostic.rs | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 934e177de0da..3f81353e28dd 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -20,7 +20,7 @@ use std::io::Write; use crate::basic::{Encoding, PageType}; -use crate::errors::Result; +use crate::errors::{ParquetError, Result}; use crate::parquet_thrift::{ ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index ab18e5199bea..5a3da53bcd47 100644 --- a/parquet/src/parquet_macros.rs +++ 
b/parquet/src/parquet_macros.rs @@ -451,11 +451,13 @@ macro_rules! __thrift_required_or_optional { #[macro_export] macro_rules! __thrift_result_required_or_optional { (required $field_name:ident) => { - let $field_name = $field_name.expect(concat!( - "Required field ", - stringify!($field_name), - " is missing", - )); + let Some($field_name) = $field_name else { + return Err(general_err!(concat!( + "Required field ", + stringify!($field_name), + " is missing", + ))); + }; }; (optional $field_name:ident) => {}; } diff --git a/parquet/tests/encryption/encryption_agnostic.rs b/parquet/tests/encryption/encryption_agnostic.rs index e071471712f4..48b5c77d9b97 100644 --- a/parquet/tests/encryption/encryption_agnostic.rs +++ b/parquet/tests/encryption/encryption_agnostic.rs @@ -72,7 +72,7 @@ pub fn read_plaintext_footer_file_without_decryption_properties() { match record_reader.next() { Some(Err(ArrowError::ParquetError(s))) => { - assert!(s.contains("protocol error")); + assert!(s.contains("Parquet error")); } _ => { panic!("Expected ArrowError::ParquetError"); @@ -137,7 +137,7 @@ pub async fn read_plaintext_footer_file_without_decryption_properties_async() { match record_reader.next().await { Some(Err(ParquetError::ArrowError(s))) => { - assert!(s.contains("protocol error")); + assert!(s.contains("Parquet error")); } _ => { panic!("Expected ArrowError::ParquetError"); From 734ee9beda3f5b592089050f99ec9b0a2695c815 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 08:52:26 -0700 Subject: [PATCH 050/126] add option to read page stats --- parquet/src/column/page.rs | 4 +- parquet/src/column/writer/mod.rs | 8 +- parquet/src/file/metadata/thrift_gen.rs | 304 ++++++++++++++++++++---- parquet/src/file/properties.rs | 24 ++ parquet/src/file/serialized_reader.rs | 73 ++++-- parquet/src/file/statistics.rs | 151 ++++++++++++ parquet/src/file/writer.rs | 5 +- 7 files changed, 492 insertions(+), 77 deletions(-) diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index e6ddbbe8ea7a..3677bc5043ad 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -333,11 +333,11 @@ pub struct PageMetadata { pub is_dict: bool, } -impl TryFrom<&crate::file::metadata::thrift_gen::PageHeaderNoStats> for PageMetadata { +impl TryFrom<&crate::file::metadata::thrift_gen::PageHeader> for PageMetadata { type Error = ParquetError; fn try_from( - value: &crate::file::metadata::thrift_gen::PageHeaderNoStats, + value: &crate::file::metadata::thrift_gen::PageHeader, ) -> std::result::Result { match value.type_ { PageType::DATA_PAGE => { diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 33fd6285c7c4..a9ce2eb57212 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2253,6 +2253,7 @@ mod tests { let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) + .set_read_page_statistics(true) .build(); let reader = SerializedPageReader::new_with_properties( Arc::new(Bytes::from(buf)), @@ -2266,13 +2267,10 @@ mod tests { let pages = reader.collect::>>().unwrap(); assert_eq!(pages.len(), 2); - /* TODO(ets): need to seek to the data page and decode ourselves since - * the stats are no longer read - */ assert_eq!(pages[0].page_type(), PageType::DICTIONARY_PAGE); assert_eq!(pages[1].page_type(), PageType::DATA_PAGE); - /*let page_statistics = pages[1].statistics().unwrap(); + let page_statistics = pages[1].statistics().unwrap(); assert_eq!( page_statistics.min_bytes_opt().unwrap(), 1_i32.to_le_bytes() @@ -2282,7 
+2280,7 @@ mod tests { 7_i32.to_le_bytes() ); assert_eq!(page_statistics.null_count_opt(), Some(0)); - assert!(page_statistics.distinct_count_opt().is_none());*/ + assert!(page_statistics.distinct_count_opt().is_none()); } #[test] diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index d20902fa0644..403f0005b3c2 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -812,31 +812,6 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaDat } } -// the following structures are only meant for reading. the -// statistics have been removed from the data page headers since -// we don't ever use those stats, but they take an inordinate -// amount of time to decode. -thrift_struct!( -pub(crate) struct DataPageHeaderNoStats { - 1: required i32 num_values - 2: required Encoding encoding - 3: required Encoding definition_level_encoding; - 4: required Encoding repetition_level_encoding; -} -); - -thrift_struct!( -pub(crate) struct DataPageHeaderV2NoStats { - 1: required i32 num_values - 2: required i32 num_nulls - 3: required i32 num_rows - 4: required Encoding encoding - 5: required i32 definition_levels_byte_length; - 6: required i32 repetition_levels_byte_length; - 7: optional bool is_compressed = true; -} -); - thrift_struct!( pub(crate) struct IndexPageHeader {} ); @@ -854,29 +829,9 @@ pub(crate) struct DictionaryPageHeader { } ); -thrift_struct!( -pub(crate) struct PageHeaderNoStats { - /// the type of the page: indicates which of the *_header fields is set - 1: required PageType type_ - - /// Uncompressed page size in bytes (not including this header) - 2: required i32 uncompressed_page_size - - /// Compressed (and potentially encrypted) page size in bytes, not including this header - 3: required i32 compressed_page_size - - /// The 32-bit CRC checksum for the page, to be be calculated as follows: - 4: optional i32 crc - - // Headers for page specific data. One only will be set. - 5: optional DataPageHeaderNoStats data_page_header; - 6: optional IndexPageHeader index_page_header; - 7: optional DictionaryPageHeader dictionary_page_header; - 8: optional DataPageHeaderV2NoStats data_page_header_v2; -} -); - -// these page headers are for the write side...they have statistics that don't require lifetimes +// Statistics for the page header. This is separate because of the differing lifetime requirements +// for page handling vs column chunk. Once we start writing column chunks this might need to be +// revisited. 
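// Illustrative only: the split exists because footer-level statistics can borrow
// their min/max bytes from the metadata buffer they were parsed from, while page
// statistics are decoded from a transient page header read off an `io::Read` and
// so must own their bytes. A hypothetical contrast of the two shapes (type names
// invented for illustration):
#[allow(dead_code)]
struct BorrowedStatsSketch<'a> {
    min_value: Option<&'a [u8]>,
    max_value: Option<&'a [u8]>,
}
#[allow(dead_code)]
struct OwnedStatsSketch {
    min_value: Option<Vec<u8>>,
    max_value: Option<Vec<u8>>,
}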
thrift_struct!( pub(crate) struct PageStatistics { 1: optional binary max; @@ -900,6 +855,75 @@ pub(crate) struct DataPageHeader { } ); +impl DataPageHeader { + fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result + where + R: ThriftCompactInputProtocol<'a>, + { + let mut num_values: Option = None; + let mut encoding: Option = None; + let mut definition_level_encoding: Option = None; + let mut repetition_level_encoding: Option = None; + let statistics: Option = None; + let mut last_field_id = 0i16; + loop { + let field_ident = prot.read_field_begin(last_field_id)?; + if field_ident.field_type == FieldType::Stop { + break; + } + match field_ident.id { + 1 => { + let val = i32::read_thrift(&mut *prot)?; + num_values = Some(val); + } + 2 => { + let val = Encoding::read_thrift(&mut *prot)?; + encoding = Some(val); + } + 3 => { + let val = Encoding::read_thrift(&mut *prot)?; + definition_level_encoding = Some(val); + } + 4 => { + let val = Encoding::read_thrift(&mut *prot)?; + repetition_level_encoding = Some(val); + } + _ => { + prot.skip(field_ident.field_type)?; + } + }; + last_field_id = field_ident.id; + } + let Some(num_values) = num_values else { + return Err(ParquetError::General( + "Required field num_values is missing".to_owned(), + )); + }; + let Some(encoding) = encoding else { + return Err(ParquetError::General( + "Required field encoding is missing".to_owned(), + )); + }; + let Some(definition_level_encoding) = definition_level_encoding else { + return Err(ParquetError::General( + "Required field definition_level_encoding is missing".to_owned(), + )); + }; + let Some(repetition_level_encoding) = repetition_level_encoding else { + return Err(ParquetError::General( + "Required field repetition_level_encoding is missing".to_owned(), + )); + }; + Ok(Self { + num_values, + encoding, + definition_level_encoding, + repetition_level_encoding, + statistics, + }) + } +} + thrift_struct!( pub(crate) struct DataPageHeaderV2 { 1: required i32 num_values @@ -913,8 +937,104 @@ pub(crate) struct DataPageHeaderV2 { } ); +impl DataPageHeaderV2 { + fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result + where + R: ThriftCompactInputProtocol<'a>, + { + let mut num_values: Option = None; + let mut num_nulls: Option = None; + let mut num_rows: Option = None; + let mut encoding: Option = None; + let mut definition_levels_byte_length: Option = None; + let mut repetition_levels_byte_length: Option = None; + let mut is_compressed: Option = None; + let statistics: Option = None; + let mut last_field_id = 0i16; + loop { + let field_ident = prot.read_field_begin(last_field_id)?; + if field_ident.field_type == FieldType::Stop { + break; + } + match field_ident.id { + 1 => { + let val = i32::read_thrift(&mut *prot)?; + num_values = Some(val); + } + 2 => { + let val = i32::read_thrift(&mut *prot)?; + num_nulls = Some(val); + } + 3 => { + let val = i32::read_thrift(&mut *prot)?; + num_rows = Some(val); + } + 4 => { + let val = Encoding::read_thrift(&mut *prot)?; + encoding = Some(val); + } + 5 => { + let val = i32::read_thrift(&mut *prot)?; + definition_levels_byte_length = Some(val); + } + 6 => { + let val = i32::read_thrift(&mut *prot)?; + repetition_levels_byte_length = Some(val); + } + 7 => { + let val = field_ident.bool_val.unwrap(); + is_compressed = Some(val); + } + _ => { + prot.skip(field_ident.field_type)?; + } + }; + last_field_id = field_ident.id; + } + let Some(num_values) = num_values else { + return Err(ParquetError::General( + "Required field num_values is missing".to_owned(), + )); + 
}; + let Some(num_nulls) = num_nulls else { + return Err(ParquetError::General( + "Required field num_nulls is missing".to_owned(), + )); + }; + let Some(num_rows) = num_rows else { + return Err(ParquetError::General( + "Required field num_rows is missing".to_owned(), + )); + }; + let Some(encoding) = encoding else { + return Err(ParquetError::General( + "Required field encoding is missing".to_owned(), + )); + }; + let Some(definition_levels_byte_length) = definition_levels_byte_length else { + return Err(ParquetError::General( + "Required field definition_levels_byte_length is missing".to_owned(), + )); + }; + let Some(repetition_levels_byte_length) = repetition_levels_byte_length else { + return Err(ParquetError::General( + "Required field repetition_levels_byte_length is missing".to_owned(), + )); + }; + Ok(Self { + num_values, + num_nulls, + num_rows, + encoding, + definition_levels_byte_length, + repetition_levels_byte_length, + is_compressed, + statistics, + }) + } +} + thrift_struct!( -#[allow(dead_code)] pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -936,6 +1056,92 @@ pub(crate) struct PageHeader { } ); +impl PageHeader { + pub(crate) fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result + where + R: ThriftCompactInputProtocol<'a>, + { + let mut type_: Option = None; + let mut uncompressed_page_size: Option = None; + let mut compressed_page_size: Option = None; + let mut crc: Option = None; + let mut data_page_header: Option = None; + let mut index_page_header: Option = None; + let mut dictionary_page_header: Option = None; + let mut data_page_header_v2: Option = None; + let mut last_field_id = 0i16; + loop { + let field_ident = prot.read_field_begin(last_field_id)?; + if field_ident.field_type == FieldType::Stop { + break; + } + match field_ident.id { + 1 => { + let val = PageType::read_thrift(&mut *prot)?; + type_ = Some(val); + } + 2 => { + let val = i32::read_thrift(&mut *prot)?; + uncompressed_page_size = Some(val); + } + 3 => { + let val = i32::read_thrift(&mut *prot)?; + compressed_page_size = Some(val); + } + 4 => { + let val = i32::read_thrift(&mut *prot)?; + crc = Some(val); + } + 5 => { + let val = DataPageHeader::read_thrift_without_stats(&mut *prot)?; + data_page_header = Some(val); + } + 6 => { + let val = IndexPageHeader::read_thrift(&mut *prot)?; + index_page_header = Some(val); + } + 7 => { + let val = DictionaryPageHeader::read_thrift(&mut *prot)?; + dictionary_page_header = Some(val); + } + 8 => { + let val = DataPageHeaderV2::read_thrift_without_stats(&mut *prot)?; + data_page_header_v2 = Some(val); + } + _ => { + prot.skip(field_ident.field_type)?; + } + }; + last_field_id = field_ident.id; + } + let Some(type_) = type_ else { + return Err(ParquetError::General( + "Required field type_ is missing".to_owned(), + )); + }; + let Some(uncompressed_page_size) = uncompressed_page_size else { + return Err(ParquetError::General( + "Required field uncompressed_page_size is missing".to_owned(), + )); + }; + let Some(compressed_page_size) = compressed_page_size else { + return Err(ParquetError::General( + "Required field compressed_page_size is missing".to_owned(), + )); + }; + Ok(Self { + type_, + uncompressed_page_size, + compressed_page_size, + crc, + data_page_header, + index_page_header, + dictionary_page_header, + data_page_header_v2, + }) + } +} + #[cfg(test)] mod tests { use crate::file::metadata::thrift_gen::BoundingBox; diff --git a/parquet/src/file/properties.rs 
b/parquet/src/file/properties.rs
index 46e43ec6ab4b..8afc89158fe6 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -1160,6 +1160,7 @@ impl ColumnProperties {
 pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
 
 const DEFAULT_READ_BLOOM_FILTER: bool = false;
+const DEFAULT_READ_PAGE_STATS: bool = false;
 
 /// Configuration settings for reading parquet files.
 ///
@@ -1182,6 +1183,7 @@ const DEFAULT_READ_BLOOM_FILTER: bool = false;
 pub struct ReaderProperties {
     codec_options: CodecOptions,
     read_bloom_filter: bool,
+    read_page_stats: bool,
 }
 
 impl ReaderProperties {
@@ -1199,6 +1201,11 @@ impl ReaderProperties {
     pub(crate) fn read_bloom_filter(&self) -> bool {
         self.read_bloom_filter
     }
+
+    /// Returns whether to read page level statistics
+    pub(crate) fn read_page_stats(&self) -> bool {
+        self.read_page_stats
+    }
 }
 
 /// Builder for parquet file reader configuration. See example on
@@ -1206,6 +1213,7 @@ impl ReaderProperties {
 pub struct ReaderPropertiesBuilder {
     codec_options_builder: CodecOptionsBuilder,
     read_bloom_filter: Option<bool>,
+    read_page_stats: Option<bool>,
 }
 
 /// Reader properties builder.
@@ -1215,6 +1223,7 @@ impl ReaderPropertiesBuilder {
         Self {
             codec_options_builder: CodecOptionsBuilder::default(),
             read_bloom_filter: None,
+            read_page_stats: None,
         }
     }
 
@@ -1223,6 +1232,7 @@ impl ReaderPropertiesBuilder {
         ReaderProperties {
             codec_options: self.codec_options_builder.build(),
             read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
+            read_page_stats: self.read_page_stats.unwrap_or(DEFAULT_READ_PAGE_STATS),
         }
     }
 
@@ -1251,6 +1261,20 @@ impl ReaderPropertiesBuilder {
         self.read_bloom_filter = Some(value);
         self
     }
+
+    /// Enable/disable reading page-level statistics
+    ///
+    /// If set to `true`, then the reader will decode and populate the [`Statistics`] for
+    /// each page, if present.
+    /// If set to `false`, then the reader will skip decoding the statistics.
+    ///
+    /// By default, statistics will not be decoded.
+    ///
+    /// [`Statistics`]: crate::file::statistics::Statistics
+    pub fn set_read_page_statistics(mut self, value: bool) -> Self {
+        self.read_page_stats = Some(value);
+        self
+    }
 }
 
 #[cfg(test)]
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 06370afe84c0..15b6c6be65e0 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -18,15 +18,16 @@
 //! Contains implementations of the reader traits FileReader, RowGroupReader and PageReader
 //!
Also contains implementations of the ChunkReader for files (with buffering) and byte arrays (RAM) -use crate::basic::PageType; +use crate::basic::{PageType, Type}; use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{read_and_decrypt, CryptoContext}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::thrift_gen::PageHeaderNoStats; +use crate::file::metadata::thrift_gen::PageHeader; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; +use crate::file::statistics; use crate::file::{ metadata::*, properties::{ReaderProperties, ReaderPropertiesPtr}, @@ -337,8 +338,9 @@ impl RowGroupReader for SerializedRowGroupReader<'_, R /// Decodes a [`Page`] from the provided `buffer` pub(crate) fn decode_page( - page_header: PageHeaderNoStats, + page_header: PageHeader, buffer: Bytes, + physical_type: Type, decompressor: Option<&mut Box>, ) -> Result { // Verify the 32-bit CRC checksum of the page @@ -431,7 +433,7 @@ pub(crate) fn decode_page( encoding: header.encoding, def_level_encoding: header.definition_level_encoding, rep_level_encoding: header.repetition_level_encoding, - statistics: None, + statistics: statistics::from_thrift_page_stats(physical_type, header.statistics)?, } } PageType::DATA_PAGE_V2 => { @@ -448,7 +450,7 @@ pub(crate) fn decode_page( def_levels_byte_len: header.definition_levels_byte_length.try_into()?, rep_levels_byte_len: header.repetition_levels_byte_length.try_into()?, is_compressed, - statistics: None, + statistics: statistics::from_thrift_page_stats(physical_type, header.statistics)?, } } _ => { @@ -471,7 +473,7 @@ enum SerializedPageReaderState { remaining_bytes: u64, // If the next page header has already been "peeked", we will cache it and it`s length here - next_page_header: Option>, + next_page_header: Option>, /// The index of the data page within this column chunk page_index: usize, @@ -493,6 +495,8 @@ enum SerializedPageReaderState { #[derive(Default)] struct SerializedPageReaderContext { + /// Controls decoding of page-level statistics + read_stats: bool, /// Crypto context carrying objects required for decryption #[cfg(feature = "encryption")] crypto_context: Option>, @@ -506,6 +510,9 @@ pub struct SerializedPageReader { /// The compression codec for this column chunk. Only set for non-PLAIN codec. decompressor: Option>, + /// Column chunk type. 
+ physical_type: Type, + state: SerializedPageReaderState, context: SerializedPageReaderContext, @@ -601,11 +608,16 @@ impl SerializedPageReader { require_dictionary: meta.dictionary_page_offset().is_some(), }, }; + let mut context = SerializedPageReaderContext::default(); + if props.read_page_stats() { + context.read_stats = true; + } Ok(Self { reader, decompressor, state, - context: Default::default(), + physical_type: meta.column_type(), + context, }) } @@ -678,7 +690,7 @@ impl SerializedPageReader { input: &mut T, page_index: usize, dictionary_page: bool, - ) -> Result<(usize, PageHeaderNoStats)> { + ) -> Result<(usize, PageHeader)> { /// A wrapper around a [`std::io::Read`] that keeps track of the bytes read struct TrackedRead { inner: R, @@ -706,7 +718,7 @@ impl SerializedPageReader { buffer: &[u8], page_index: usize, dictionary_page: bool, - ) -> Result<(usize, PageHeaderNoStats)> { + ) -> Result<(usize, PageHeader)> { let mut input = std::io::Cursor::new(buffer); let header = context.read_page_header(&mut input, page_index, dictionary_page)?; let header_len = input.position() as usize; @@ -721,11 +733,15 @@ impl SerializedPageReaderContext { input: &mut T, _page_index: usize, _dictionary_page: bool, - ) -> Result { + ) -> Result { use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; let mut prot = ThriftReadInputProtocol::new(input); - Ok(PageHeaderNoStats::read_thrift(&mut prot)?) + if self.read_stats { + Ok(PageHeader::read_thrift(&mut prot)?) + } else { + Ok(PageHeader::read_thrift_without_stats(&mut prot)?) + } } fn decrypt_page_data( @@ -745,13 +761,19 @@ impl SerializedPageReaderContext { input: &mut T, page_index: usize, dictionary_page: bool, - ) -> Result { + ) -> Result { match self.page_crypto_context(page_index, dictionary_page) { None => { use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; let mut prot = ThriftReadInputProtocol::new(input); - Ok(PageHeaderNoStats::read_thrift(&mut prot)?) + if self.read_stats { + Ok(PageHeader::read_thrift(&mut prot)?) + } else { + use crate::file::metadata::thrift_gen::PageHeader; + + Ok(PageHeader::read_thrift_without_stats(&mut prot)?) + } } Some(page_crypto_context) => { use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; @@ -767,7 +789,15 @@ impl SerializedPageReaderContext { })?; let mut prot = ThriftSliceInputProtocol::new(buf.as_slice()); - Ok(PageHeaderNoStats::read_thrift(&mut prot)?) + if self.read_stats { + use crate::file::metadata::thrift_gen::PageHeader; + + Ok(PageHeader::read_thrift(&mut prot)?) + } else { + use crate::file::metadata::thrift_gen::PageHeader; + + Ok(PageHeader::read_thrift_without_stats(&mut prot)?) + } } } } @@ -890,8 +920,12 @@ impl PageReader for SerializedPageReader { self.context .decrypt_page_data(buffer, *page_index, *require_dictionary)?; - let page = - decode_page(header, Bytes::from(buffer), self.decompressor.as_mut())?; + let page = decode_page( + header, + Bytes::from(buffer), + self.physical_type, + self.decompressor.as_mut(), + )?; if page.is_data_page() { *page_index += 1; } else if page.is_dictionary_page() { @@ -930,7 +964,12 @@ impl PageReader for SerializedPageReader { if !is_dictionary_page { *page_index += 1; } - decode_page(header, bytes, self.decompressor.as_mut())? + decode_page( + header, + bytes, + self.physical_type, + self.decompressor.as_mut(), + )? 
} }; diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 674b055f34ed..e51f445b7e7e 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -118,6 +118,7 @@ macro_rules! statistics_enum_func { }}; } +// FIXME(ets): remove this when done with format changes /// Converts Thrift definition into `Statistics`. pub fn from_thrift( physical_type: Type, @@ -267,6 +268,156 @@ pub fn from_thrift( }) } +/// Converts Thrift definition into `Statistics`. +pub(crate) fn from_thrift_page_stats( + physical_type: Type, + thrift_stats: Option, +) -> Result> { + Ok(match thrift_stats { + Some(stats) => { + // Number of nulls recorded, when it is not available, we just mark it as 0. + // TODO this should be `None` if there is no information about NULLS. + // see https://github.com/apache/arrow-rs/pull/6216/files + let null_count = stats.null_count.unwrap_or(0); + + if null_count < 0 { + return Err(ParquetError::General(format!( + "Statistics null count is negative {null_count}", + ))); + } + + // Generic null count. + let null_count = Some(null_count as u64); + // Generic distinct count (count of distinct values occurring) + let distinct_count = stats.distinct_count.map(|value| value as u64); + // Whether or not statistics use deprecated min/max fields. + let old_format = stats.min_value.is_none() && stats.max_value.is_none(); + // Generic min value as bytes. + let min = if old_format { + stats.min + } else { + stats.min_value + }; + // Generic max value as bytes. + let max = if old_format { + stats.max + } else { + stats.max_value + }; + + fn check_len(min: &Option>, max: &Option>, len: usize) -> Result<()> { + if let Some(min) = min { + if min.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse min statistic".to_string(), + )); + } + } + if let Some(max) = max { + if max.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse max statistic".to_string(), + )); + } + } + Ok(()) + } + + match physical_type { + Type::BOOLEAN => check_len(&min, &max, 1), + Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), + Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), + Type::INT96 => check_len(&min, &max, 12), + _ => Ok(()), + }?; + + // Values are encoded using PLAIN encoding definition, except that + // variable-length byte arrays do not include a length prefix. + // + // Instead of using actual decoder, we manually convert values. + let res = match physical_type { + Type::BOOLEAN => Statistics::boolean( + min.map(|data| data[0] != 0), + max.map(|data| data[0] != 0), + distinct_count, + null_count, + old_format, + ), + Type::INT32 => Statistics::int32( + min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), + max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ), + Type::INT64 => Statistics::int64( + min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), + max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ), + Type::INT96 => { + // INT96 statistics may not be correct, because comparison is signed + let min = if let Some(data) = min { + assert_eq!(data.len(), 12); + Some(Int96::try_from_le_slice(&data)?) + } else { + None + }; + let max = if let Some(data) = max { + assert_eq!(data.len(), 12); + Some(Int96::try_from_le_slice(&data)?) 
+ } else { + None + }; + Statistics::int96(min, max, distinct_count, null_count, old_format) + } + Type::FLOAT => Statistics::float( + min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), + max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ), + Type::DOUBLE => Statistics::double( + min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), + max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ), + Type::BYTE_ARRAY => Statistics::ByteArray( + ValueStatistics::new( + min.map(ByteArray::from), + max.map(ByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), + ), + Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray( + ValueStatistics::new( + min.map(ByteArray::from).map(FixedLenByteArray::from), + max.map(ByteArray::from).map(FixedLenByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), + ), + }; + + Some(res) + } + None => None, + }) +} + +// FIXME(ets): remove when done with format changes /// Convert Statistics into Thrift definition. pub fn to_thrift(stats: Option<&Statistics>) -> Option { let stats = stats?; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 9b0e8b1da698..284901e9864f 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1425,9 +1425,7 @@ mod tests { ); } - // FIXME(ets) #[test] - #[ignore = "still working on page stats"] fn test_page_writer_data_pages() { let pages = vec![ Page::DataPage { @@ -1455,9 +1453,7 @@ mod tests { test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); } - // FIXME(ets) #[test] - #[ignore = "still working on page stats"] fn test_page_writer_dict_pages() { let pages = vec![ Page::DictionaryPage { @@ -1606,6 +1602,7 @@ mod tests { let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) + .set_read_page_statistics(true) .build(); let mut page_reader = SerializedPageReader::new_with_properties( Arc::new(reader), From 55697572cfde259c0534395fdc1b541af3f51d4d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 09:00:40 -0700 Subject: [PATCH 051/126] add comments --- parquet/src/file/metadata/thrift_gen.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 403f0005b3c2..c1c7f2de7569 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -856,6 +856,7 @@ pub(crate) struct DataPageHeader { ); impl DataPageHeader { + // reader that skips decoding page statistics fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result where R: ThriftCompactInputProtocol<'a>, @@ -938,6 +939,7 @@ pub(crate) struct DataPageHeaderV2 { ); impl DataPageHeaderV2 { + // reader that skips decoding page statistics fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result where R: ThriftCompactInputProtocol<'a>, @@ -1057,6 +1059,9 @@ pub(crate) struct PageHeader { ); impl PageHeader { + // reader that skips reading page statistics. 
obtained by running + // `cargo expand -p parquet --all-features --lib file::metadata::thrift_gen` + // and modifying the impl of `read_thrift` pub(crate) fn read_thrift_without_stats<'a, R>(prot: &mut R) -> Result where R: ThriftCompactInputProtocol<'a>, From 23636c9b3ddb14baa66511b972fe2af69a055ab8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 09:10:01 -0700 Subject: [PATCH 052/126] clippy --- parquet/src/column/page.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 3677bc5043ad..1a8c15c457fd 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -206,7 +206,7 @@ impl CompressedPage { let page_type = self.page_type(); let mut page_header = PageHeader { - type_: page_type.into(), + type_: page_type, uncompressed_page_size: uncompressed_size as i32, compressed_page_size: compressed_size as i32, // TODO: Add support for crc checksum @@ -226,9 +226,9 @@ impl CompressedPage { } => { let data_page_header = DataPageHeader { num_values: num_values as i32, - encoding: encoding.into(), - definition_level_encoding: def_level_encoding.into(), - repetition_level_encoding: rep_level_encoding.into(), + encoding, + definition_level_encoding: def_level_encoding, + repetition_level_encoding: rep_level_encoding, statistics: page_stats_to_thrift(statistics.as_ref()), }; page_header.data_page_header = Some(data_page_header); @@ -246,7 +246,7 @@ impl CompressedPage { num_values: num_values as i32, num_nulls: num_nulls as i32, num_rows: num_rows as i32, - encoding: encoding.into(), + encoding, definition_levels_byte_length: def_levels_byte_len as i32, repetition_levels_byte_length: rep_levels_byte_len as i32, is_compressed: Some(is_compressed), @@ -257,7 +257,7 @@ impl CompressedPage { Page::DictionaryPage { is_sorted, .. } => { let dictionary_page_header = DictionaryPageHeader { num_values: num_values as i32, - encoding: encoding.into(), + encoding, is_sorted: Some(is_sorted), }; page_header.dictionary_page_header = Some(dictionary_page_header); From 179bb21ec040eda96233175388464bb22cc7d9b8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 09:31:34 -0700 Subject: [PATCH 053/126] switch page header bench to new code --- parquet/src/thrift.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index 984ba43ec7ad..53cf5acd19dc 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -23,6 +23,8 @@ use thrift::protocol::{ TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, }; +use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; + /// Reads and writes the struct to Thrift protocols. /// /// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects @@ -41,8 +43,8 @@ pub fn bench_file_metadata(bytes: &bytes::Bytes) { /// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. 
pub fn bench_page_header(bytes: &bytes::Bytes) {
-    let mut input = TCompactSliceInputProtocol::new(bytes);
-    crate::format::PageHeader::read_from_in_protocol(&mut input).unwrap();
+    let mut prot = ThriftReadInputProtocol::new(bytes.as_ref());
+    crate::file::metadata::thrift_gen::PageHeader::read_thrift(&mut prot).unwrap();
 }
 
 /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice

From 4f7bd62f6cc8cc8e83420c6a3423afc33b8dfbd2 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 29 Aug 2025 10:17:23 -0700
Subject: [PATCH 054/126] add comment

---
 parquet/src/parquet_macros.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs
index 5a3da53bcd47..358b70a1b7ff 100644
--- a/parquet/src/parquet_macros.rs
+++ b/parquet/src/parquet_macros.rs
@@ -447,6 +447,8 @@ macro_rules! __thrift_required_or_optional {
     (optional $field_type:ty) => { Option<$field_type> };
 }
 
+// Performance note: using `expect` here is about 4% faster on the page index bench,
+// but we want to propagate errors. Using `ok_or` is *much* slower.
 #[doc(hidden)]
 #[macro_export]
 macro_rules! __thrift_result_required_or_optional {

From 51cf33a9bae50808ddddfa23f2e71767f684878c Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 29 Aug 2025 11:20:45 -0700
Subject: [PATCH 055/126] benchmark changes

---
 parquet/benches/metadata.rs             | 21 +++++++++++++++++++++
 parquet/src/file/metadata/thrift_gen.rs |  7 +++++++
 parquet/src/thrift.rs                   | 14 +++++++++-----
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs
index 151d928957ff..85cee4baf653 100644
--- a/parquet/benches/metadata.rs
+++ b/parquet/benches/metadata.rs
@@ -164,6 +164,7 @@ fn rewrite_file(bytes: Bytes) -> (Bytes, FileMetaData) {
         .expect("parquet open");
     let writer_properties = WriterProperties::builder()
         .set_statistics_enabled(EnabledStatistics::Page)
+        .set_write_page_header_statistics(true)
         .build();
     let mut output = Vec::new();
     let mut parquet_writer = ArrowWriter::try_new(
@@ -246,6 +247,26 @@ fn criterion_benchmark(c: &mut Criterion) {
             });
         })
     });
+
+    #[cfg(feature = "arrow")]
+    c.bench_function("page headers (no stats)", |b| {
+        b.iter(|| {
+            metadata.row_groups.iter().for_each(|rg| {
+                rg.columns.iter().for_each(|col| {
+                    if let Some(col_meta) = &col.meta_data {
+                        if let Some(dict_offset) = col_meta.dictionary_page_offset {
+                            parquet::thrift::bench_page_header_no_stats(
+                                &file_bytes.slice(dict_offset as usize..),
+                            );
+                        }
+                        parquet::thrift::bench_page_header_no_stats(
+                            &file_bytes.slice(col_meta.data_page_offset as usize..),
+                        );
+                    }
+                });
+            });
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs
index c1c7f2de7569..06223bf03af8 100644
--- a/parquet/src/file/metadata/thrift_gen.rs
+++ b/parquet/src/file/metadata/thrift_gen.rs
@@ -117,6 +117,13 @@ pub(crate) struct FileCryptoMetaData<'a> {
 }
 );
 
+// expose for benchmarking
+pub(crate) fn bench_file_metadata(bytes: &bytes::Bytes) {
+    use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol};
+    let mut prot = ThriftSliceInputProtocol::new(bytes);
+    crate::file::metadata::thrift_gen::FileMetaData::read_thrift(&mut prot).unwrap();
+}
+
 // the following are only used internally so are private
 thrift_struct!(
 struct FileMetaData<'a> {
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index 53cf5acd19dc..bb81688b92a9 100644
--- a/parquet/src/thrift.rs
+++
b/parquet/src/thrift.rs @@ -23,8 +23,6 @@ use thrift::protocol::{ TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, }; -use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; - /// Reads and writes the struct to Thrift protocols. /// /// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects @@ -37,16 +35,22 @@ pub trait TSerializable: Sized { /// Public function to aid benchmarking. Reads Parquet `FileMetaData` encoded in `bytes`. pub fn bench_file_metadata(bytes: &bytes::Bytes) { - let mut input = TCompactSliceInputProtocol::new(bytes); - crate::format::FileMetaData::read_from_in_protocol(&mut input).unwrap(); + crate::file::metadata::thrift_gen::bench_file_metadata(bytes); } /// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. pub fn bench_page_header(bytes: &bytes::Bytes) { - let mut prot = ThriftReadInputProtocol::new(bytes.as_ref()); + use crate::parquet_thrift::ReadThrift; + let mut prot = crate::parquet_thrift::ThriftReadInputProtocol::new(bytes.as_ref()); crate::file::metadata::thrift_gen::PageHeader::read_thrift(&mut prot).unwrap(); } +/// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. +pub fn bench_page_header_no_stats(bytes: &bytes::Bytes) { + let mut prot = crate::parquet_thrift::ThriftReadInputProtocol::new(bytes.as_ref()); + crate::file::metadata::thrift_gen::PageHeader::read_thrift_without_stats(&mut prot).unwrap(); +} + /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice /// /// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol From b4ca56e47d71436de16986617f4c9ae3044ff8e6 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 29 Aug 2025 11:39:45 -0700 Subject: [PATCH 056/126] update benchmarks to match thrift-remodel feature branch --- parquet/benches/metadata.rs | 74 +++++++++++++++++++++++++++++++++++-- parquet/src/thrift.rs | 8 +++- 2 files changed, 77 insertions(+), 5 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 949e0d98ea39..297c4296c08a 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
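// For context, a hedged sketch of the read-side knob these benchmarks relate to:
// enabling page-header statistics via ReaderProperties. The
// SerializedPageReader::new_with_properties signature is assumed from the tests
// earlier in this series; error handling is minimal.
#[allow(dead_code)]
fn read_pages_with_stats_sketch(
    data: bytes::Bytes,
    meta: &parquet::file::metadata::ColumnChunkMetaData,
    total_rows: usize,
) -> parquet::errors::Result<()> {
    use parquet::file::properties::ReaderProperties;
    use parquet::file::serialized_reader::SerializedPageReader;
    use std::sync::Arc;

    let props = ReaderProperties::builder()
        .set_read_page_statistics(true) // defaults to false: stats decoding is skipped
        .build();
    let reader = SerializedPageReader::new_with_properties(
        Arc::new(data),
        meta,
        total_rows,
        None,
        Arc::new(props),
    )?;
    for page in reader {
        // statistics() is Some only for data pages whose writer stored stats
        let _stats = page?.statistics().cloned();
    }
    Ok(())
}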
+use parquet::file::metadata::ParquetMetaDataReader; use rand::Rng; use thrift::protocol::TCompactOutputProtocol; @@ -151,6 +152,36 @@ fn get_footer_bytes(data: Bytes) -> Bytes { data.slice(meta_start..meta_end) } +#[cfg(feature = "arrow")] +fn rewrite_file(bytes: Bytes) -> (Bytes, FileMetaData) { + use arrow::array::RecordBatchReader; + use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter}; + use parquet::file::properties::{EnabledStatistics, WriterProperties}; + + let parquet_reader = ParquetRecordBatchReaderBuilder::try_new(bytes) + .expect("parquet open") + .build() + .expect("parquet open"); + let writer_properties = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_write_page_header_statistics(true) + .build(); + let mut output = Vec::new(); + let mut parquet_writer = ArrowWriter::try_new( + &mut output, + parquet_reader.schema(), + Some(writer_properties), + ) + .expect("create arrow writer"); + + for maybe_batch in parquet_reader { + let batch = maybe_batch.expect("reading batch"); + parquet_writer.write(&batch).expect("writing data"); + } + let file_meta = parquet_writer.close().expect("finalizing file"); + (output.into(), file_meta) +} + fn criterion_benchmark(c: &mut Criterion) { // Read file into memory to isolate filesystem performance let file = "../parquet-testing/data/alltypes_tiny_pages.parquet"; @@ -168,19 +199,54 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let meta_data = get_footer_bytes(data); - c.bench_function("decode file metadata", |b| { + let meta_data = get_footer_bytes(data.clone()); + c.bench_function("decode parquet metadata", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata(&meta_data).unwrap(); + }) + }); + + c.bench_function("decode thrift file metadata", |b| { b.iter(|| { parquet::thrift::bench_file_metadata(&meta_data); }) }); - let buf = black_box(encoded_meta()).into(); - c.bench_function("decode file metadata (wide)", |b| { + let buf: Bytes = black_box(encoded_meta()).into(); + c.bench_function("decode parquet metadata (wide)", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata(&buf).unwrap(); + }) + }); + + c.bench_function("decode thrift file metadata (wide)", |b| { b.iter(|| { parquet::thrift::bench_file_metadata(&buf); }) }); + + // rewrite file with page statistics. then read page headers. + #[cfg(feature = "arrow")] + let (file_bytes, metadata) = rewrite_file(data.clone()); + #[cfg(feature = "arrow")] + c.bench_function("page headers", |b| { + b.iter(|| { + metadata.row_groups.iter().for_each(|rg| { + rg.columns.iter().for_each(|col| { + if let Some(col_meta) = &col.meta_data { + if let Some(dict_offset) = col_meta.dictionary_page_offset { + parquet::thrift::bench_page_header( + &file_bytes.slice(dict_offset as usize..), + ); + } + parquet::thrift::bench_page_header( + &file_bytes.slice(col_meta.data_page_offset as usize..), + ); + } + }); + }); + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index fc391abe87d7..4fecc53c4a73 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -33,12 +33,18 @@ pub trait TSerializable: Sized { fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()>; } -/// Public function to aid benchmarking. +/// Public function to aid benchmarking. Reads Parquet `FileMetaData` encoded in `bytes`. 
pub fn bench_file_metadata(bytes: &bytes::Bytes) { let mut input = TCompactSliceInputProtocol::new(bytes); crate::format::FileMetaData::read_from_in_protocol(&mut input).unwrap(); } +/// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. +pub fn bench_page_header(bytes: &bytes::Bytes) { + let mut prot = TCompactSliceInputProtocol::new(bytes); + crate::format::PageHeader::read_from_in_protocol(&mut prot).unwrap(); +} + /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice /// /// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol From c702a44b65c23a01ce7df3a9f5500bd9634e492e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 30 Aug 2025 11:58:01 -0700 Subject: [PATCH 057/126] add encoding_stats to wide data set --- parquet/benches/metadata.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 297c4296c08a..565ae96c1f6f 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -16,6 +16,7 @@ // under the License. use parquet::file::metadata::ParquetMetaDataReader; +use parquet::file::page_encoding_stats::PageEncodingStats; use rand::Rng; use thrift::protocol::TCompactOutputProtocol; @@ -26,7 +27,7 @@ use parquet::file::reader::SerializedFileReader; use parquet::file::serialized_reader::ReadOptionsBuilder; use parquet::format::{ ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, FieldRepetitionType, FileMetaData, - RowGroup, SchemaElement, Type, + PageType, RowGroup, SchemaElement, Type, }; use parquet::thrift::TSerializable; @@ -94,7 +95,18 @@ fn encoded_meta() -> Vec { index_page_offset: Some(rng.random()), dictionary_page_offset: Some(rng.random()), statistics: Some(stats.clone()), - encoding_stats: None, + encoding_stats: Some(vec![ + parquet::format::PageEncodingStats { + page_type: PageType::DICTIONARY_PAGE, + encoding: Encoding::PLAIN, + count: 1, + }, + parquet::format::PageEncodingStats { + page_type: PageType::DATA_PAGE, + encoding: Encoding::RLE_DICTIONARY, + count: 10, + }, + ]), bloom_filter_offset: None, bloom_filter_length: None, size_statistics: None, From 0893ec750013d7ea3f98af46b38cdd0ef213edcd Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 30 Aug 2025 12:44:26 -0700 Subject: [PATCH 058/126] clippy --- parquet/benches/metadata.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 565ae96c1f6f..8c886e4d5eea 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -16,7 +16,6 @@ // under the License. 
use parquet::file::metadata::ParquetMetaDataReader; -use parquet::file::page_encoding_stats::PageEncodingStats; use rand::Rng; use thrift::protocol::TCompactOutputProtocol; @@ -27,7 +26,7 @@ use parquet::file::reader::SerializedFileReader; use parquet::file::serialized_reader::ReadOptionsBuilder; use parquet::format::{ ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, FieldRepetitionType, FileMetaData, - PageType, RowGroup, SchemaElement, Type, + PageEncodingStats, PageType, RowGroup, SchemaElement, Type, }; use parquet::thrift::TSerializable; @@ -96,12 +95,12 @@ fn encoded_meta() -> Vec { dictionary_page_offset: Some(rng.random()), statistics: Some(stats.clone()), encoding_stats: Some(vec![ - parquet::format::PageEncodingStats { + PageEncodingStats { page_type: PageType::DICTIONARY_PAGE, encoding: Encoding::PLAIN, count: 1, }, - parquet::format::PageEncodingStats { + PageEncodingStats { page_type: PageType::DATA_PAGE, encoding: Encoding::RLE_DICTIONARY, count: 10, From 56f5c5d3fb983a95c5dc9f8bab57050e92620e25 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 4 Sep 2025 09:52:12 -0700 Subject: [PATCH 059/126] remove dup from merge --- parquet/src/thrift.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index 6bdfab83439e..bb81688b92a9 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -51,12 +51,6 @@ pub fn bench_page_header_no_stats(bytes: &bytes::Bytes) { crate::file::metadata::thrift_gen::PageHeader::read_thrift_without_stats(&mut prot).unwrap(); } -/// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. -pub fn bench_page_header(bytes: &bytes::Bytes) { - let mut prot = TCompactSliceInputProtocol::new(bytes); - crate::format::PageHeader::read_from_in_protocol(&mut prot).unwrap(); -} - /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice /// /// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol From b37029e1d7816d30d59a1a6e1c06abe0135aa7f3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 06:47:59 -0700 Subject: [PATCH 060/126] checkpoint offset index --- parquet/src/file/metadata/writer.rs | 81 ++++++++++++++++++--- parquet/src/file/page_index/offset_index.rs | 8 -- parquet/src/file/writer.rs | 14 +--- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 404bcf5dba8a..39ee1c8fe76e 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -24,8 +24,6 @@ use crate::encryption::{ }; #[cfg(feature = "encryption")] use crate::errors::ParquetError; -use crate::file::metadata::{KeyValue, ParquetMetaData}; -use crate::file::writer::{get_file_magic, TrackedWrite}; use crate::format::EncryptionAlgorithm; #[cfg(feature = "encryption")] use crate::format::{AesGcmV1, ColumnCryptoMetaData}; @@ -33,6 +31,17 @@ use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; use crate::{errors::Result, file::page_index::column_index::ColumnIndexMetaData}; +use crate::{ + file::writer::{get_file_magic, TrackedWrite}, + parquet_thrift::WriteThrift, +}; +use crate::{ + file::{ + metadata::{KeyValue, ParquetMetaData}, + page_index::offset_index::OffsetIndexMetaData, + }, + parquet_thrift::ThriftCompactOutputProtocol, +}; use std::io::Write; use std::sync::Arc; use thrift::protocol::TCompactOutputProtocol; @@ -46,7 +55,7 @@ pub(crate) struct 
ThriftMetadataWriter<'a, W: Write> { schema_descr: &'a SchemaDescPtr, row_groups: Vec, column_indexes: Option<&'a [Vec>]>, - offset_indexes: Option<&'a [Vec>]>, + offset_indexes: Option<&'a [Vec>]>, key_value_metadata: Option>, created_by: Option, object_writer: MetadataObjectWriter, @@ -61,7 +70,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { /// of the serialized offset indexes. fn write_offset_indexes( &mut self, - offset_indexes: &[Vec>], + offset_indexes: &[Vec>], ) -> Result<()> { // iter row group // iter each column @@ -222,7 +231,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { pub fn with_offset_indexes( mut self, - offset_indexes: &'a [Vec>], + offset_indexes: &'a [Vec>], ) -> Self { self.offset_indexes = Some(offset_indexes); self @@ -429,14 +438,14 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { } } - fn convert_offset_index(&self) -> Vec>> { + fn convert_offset_index(&self) -> Vec>> { if let Some(row_group_offset_indexes) = self.metadata.offset_index() { (0..self.metadata.row_groups().len()) .map(|rg_idx| { let offset_indexes = &row_group_offset_indexes[rg_idx]; offset_indexes .iter() - .map(|offset_index| Some(offset_index.to_thrift())) + .map(|offset_index| Some(offset_index.clone())) .collect() }) .collect() @@ -464,6 +473,13 @@ impl MetadataObjectWriter { object.write_to_out_protocol(&mut protocol)?; Ok(()) } + + #[inline] + fn write_thrift_object(object: &impl WriteThrift, sink: impl Write) -> Result<()> { + let mut protocol = ThriftCompactOutputProtocol::new(sink); + object.write_thrift(&mut protocol)?; + Ok(()) + } } /// Implementations of [`MetadataObjectWriter`] methods for when encryption is disabled @@ -568,14 +584,14 @@ impl MetadataObjectWriter { /// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md fn write_offset_index( &self, - offset_index: &crate::format::OffsetIndex, + offset_index: &OffsetIndexMetaData, column_chunk: &crate::format::ColumnChunk, row_group_idx: usize, column_idx: usize, sink: impl Write, ) -> Result<()> { match &self.file_encryptor { - Some(file_encryptor) => Self::write_object_with_encryption( + Some(file_encryptor) => Self::write_thrift_object_with_encryption( offset_index, sink, file_encryptor, @@ -584,7 +600,7 @@ impl MetadataObjectWriter { row_group_idx, column_idx, ), - None => Self::write_object(offset_index, sink), + None => Self::write_thrift_object(offset_index, sink), } } @@ -685,6 +701,51 @@ impl MetadataObjectWriter { } } + fn write_thrift_object_with_encryption( + object: &impl WriteThrift, + mut sink: impl Write, + file_encryptor: &FileEncryptor, + column_metadata: &crate::format::ColumnChunk, + module_type: ModuleType, + row_group_index: usize, + column_index: usize, + ) -> Result<()> { + let column_path_vec = &column_metadata + .meta_data + .as_ref() + .ok_or_else(|| { + general_err!( + "Column metadata not set for column {} when encrypting object", + column_index + ) + })? 
+ .path_in_schema; + + let joined_column_path; + let column_path = if column_path_vec.len() == 1 { + &column_path_vec[0] + } else { + joined_column_path = column_path_vec.join("."); + &joined_column_path + }; + + if file_encryptor.is_column_encrypted(column_path) { + use crate::encryption::encrypt::encrypt_thrift_object; + + let aad = create_module_aad( + file_encryptor.file_aad(), + module_type, + row_group_index, + column_index, + None, + )?; + let mut encryptor = file_encryptor.get_column_encryptor(column_path)?; + encrypt_thrift_object(object, &mut encryptor, &mut sink, &aad) + } else { + Self::write_thrift_object(object, sink) + } + } + fn get_plaintext_footer_crypto_metadata( &self, ) -> (Option, Option>) { diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 2153b8ed3009..30b58ce0acb3 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -102,14 +102,6 @@ impl OffsetIndexMetaData { self.unencoded_byte_array_data_bytes.as_ref() } - pub(crate) fn to_thrift(&self) -> crate::format::OffsetIndex { - let page_locations = self.page_locations.iter().map(|loc| loc.into()).collect(); - crate::format::OffsetIndex::new( - page_locations, - self.unencoded_byte_array_data_bytes.clone(), - ) - } - // Fast-path read of offset index. This works because we expect all field deltas to be 1, // and there's no nesting beyond PageLocation, so no need to save the last field id. Like // read_page_locations(), this will fail if absolute field id's are used. diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 284901e9864f..7a4706964156 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -340,7 +340,6 @@ impl SerializedFileWriter { .collect::>(); let column_indexes = self.convert_column_indexes(); - let offset_indexes = self.convert_offset_index(); let mut encoder = ThriftMetadataWriter::new( &mut self.buf, @@ -361,7 +360,7 @@ impl SerializedFileWriter { } encoder = encoder.with_column_indexes(&column_indexes); - encoder = encoder.with_offset_indexes(&offset_indexes); + encoder = encoder.with_offset_indexes(&self.offset_indexes); encoder.finish() } @@ -388,17 +387,6 @@ impl SerializedFileWriter { .collect() } - fn convert_offset_index(&self) -> Vec>> { - self.offset_indexes - .iter() - .map(|ois| { - ois.iter() - .map(|oi| oi.as_ref().map(|offset_index| offset_index.to_thrift())) - .collect() - }) - .collect() - } - #[inline] fn assert_previous_writer_closed(&self) -> Result<()> { if self.finished { From 086d04c604c672c3a1a6886f35b21dad95027aa5 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 10:12:55 -0700 Subject: [PATCH 061/126] write path for column index --- parquet/src/column/writer/mod.rs | 76 ++++--- parquet/src/file/metadata/mod.rs | 213 ++++++++++---------- parquet/src/file/metadata/writer.rs | 93 ++------- parquet/src/file/page_index/column_index.rs | 176 ++++++++++------ parquet/src/file/page_index/index.rs | 1 + parquet/src/file/writer.rs | 35 +--- 6 files changed, 280 insertions(+), 314 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index a9ce2eb57212..4d5a7cfa5e5c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -21,7 +21,7 @@ use bytes::Bytes; use half::f16; use crate::bloom_filter::Sbbf; -use crate::file::page_index::index::Index; +use crate::file::page_index::column_index::ColumnIndexMetaData; use 
crate::file::page_index::offset_index::OffsetIndexMetaData;
 use std::collections::{BTreeSet, VecDeque};
 use std::str;
@@ -188,7 +188,7 @@ pub struct ColumnCloseResult {
     /// Optional bloom filter for this column
     pub bloom_filter: Option<Sbbf>,
     /// Optional column index, for filtering
-    pub column_index: Option<Index>,
+    pub column_index: Option<ColumnIndexMetaData>,
     /// Optional offset index, identifying page locations
     pub offset_index: Option<OffsetIndexMetaData>,
 }
@@ -2944,28 +2944,22 @@ mod tests {
         assert!(r.column_index.is_some());
         let col_idx = r.column_index.unwrap();
         let col_idx = match col_idx {
-            Index::INT32(col_idx) => col_idx,
+            ColumnIndexMetaData::INT32(col_idx) => col_idx,
             _ => panic!("wrong stats type"),
         };
         // null_pages should be true for page 0
-        assert!(col_idx.indexes[0].is_null_page());
+        assert!(col_idx.is_null_page(0));
         // min and max should be empty byte arrays
-        assert!(col_idx.indexes[0].min().is_none());
-        assert!(col_idx.indexes[0].max().is_none());
+        assert!(col_idx.min_value(0).is_none());
+        assert!(col_idx.max_value(0).is_none());
         // null_counts should be defined and be 4 for page 0
-        assert!(col_idx.indexes[0].null_count().is_some());
-        assert_eq!(col_idx.indexes[0].null_count().unwrap(), 4);
+        assert!(col_idx.null_count(0).is_some());
+        assert_eq!(col_idx.null_count(0), Some(4));
         // there is no repetition so rep histogram should be absent
-        assert!(col_idx.indexes[0].repetition_level_histogram().is_none());
+        assert!(col_idx.repetition_level_histogram(0).is_none());
         // definition_level_histogram should be present and should be 0:4, 1:0
-        assert!(col_idx.indexes[0].definition_level_histogram().is_some());
-        assert_eq!(
-            col_idx.indexes[0]
-                .definition_level_histogram()
-                .unwrap()
-                .values(),
-            &[4, 0]
-        );
+        assert!(col_idx.definition_level_histogram(0).is_some());
+        assert_eq!(col_idx.definition_level_histogram(0).unwrap(), &[4, 0]);
     }
 
     #[test]
@@ -2989,15 +2983,15 @@
         // column index
         let column_index = match column_index {
-            Index::INT32(column_index) => column_index,
+            ColumnIndexMetaData::INT32(column_index) => column_index,
             _ => panic!("wrong stats type"),
         };
-        assert_eq!(2, column_index.indexes.len());
+        assert_eq!(2, column_index.num_pages());
         assert_eq!(2, offset_index.page_locations.len());
         assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order);
         for idx in 0..2 {
-            assert!(!column_index.indexes[idx].is_null_page());
-            assert_eq!(0, *column_index.indexes[idx].null_count.as_ref().unwrap());
+            assert!(!column_index.is_null_page(idx));
+            assert_eq!(0, column_index.null_count(idx).unwrap());
         }
 
         if let Some(stats) = r.metadata.statistics() {
@@ -3007,8 +3001,8 @@ mod tests {
             // first page is [1,2,3,4]
            // second page is [-5,2,4,8]
            // note that we don't increment here, as this is a non BinaryArray type.
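// (Background, illustrative only: for BYTE_ARRAY columns a truncated column
// index max must be incremented so it remains an upper bound. A minimal sketch
// of that truncate-and-bump step, with names invented here:)
#[allow(dead_code)]
fn truncated_max_sketch(mut max: Vec<u8>, len: usize) -> Option<Vec<u8>> {
    max.truncate(len);
    while let Some(last) = max.last_mut() {
        if *last < u8::MAX {
            *last += 1; // e.g. max "banana" truncated to 3 bytes becomes "bao"
            return Some(max);
        }
        max.pop(); // trailing 0xFF would overflow; shorten and retry
    }
    None // all bytes were 0xFF: no upper bound exists at this length
}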
- assert_eq!(stats.min_opt(), column_index.indexes[1].min()); - assert_eq!(stats.max_opt(), column_index.indexes[1].max()); + assert_eq!(stats.min_opt(), column_index.min_value(1)); + assert_eq!(stats.max_opt(), column_index.max_value(1)); } else { panic!("expecting Statistics::Int32"); } @@ -3049,25 +3043,25 @@ mod tests { let offset_index = r.offset_index.unwrap(); let column_index = match column_index { - Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, _ => panic!("wrong stats type"), }; assert_eq!(3, r.rows_written); // column index - assert_eq!(1, column_index.indexes.len()); + assert_eq!(1, column_index.num_pages()); assert_eq!(1, offset_index.page_locations.len()); assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order); - assert!(!column_index.indexes[0].is_null_page()); - assert_eq!(Some(0), column_index.indexes[0].null_count()); + assert!(!column_index.is_null_page(0)); + assert_eq!(Some(0), column_index.null_count(0)); if let Some(stats) = r.metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count_opt(), None); if let Statistics::FixedLenByteArray(stats) = stats { - let column_index_min_value = column_index.indexes[0].min_bytes().unwrap(); - let column_index_max_value = column_index.indexes[0].max_bytes().unwrap(); + let column_index_min_value = column_index.min_value(0).unwrap(); + let column_index_max_value = column_index.max_value(0).unwrap(); // Column index stats are truncated, while the column chunk's aren't. assert_ne!(stats.min_bytes_opt().unwrap(), column_index_min_value); @@ -3120,25 +3114,25 @@ mod tests { let offset_index = r.offset_index.unwrap(); let column_index = match column_index { - Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, _ => panic!("wrong stats type"), }; assert_eq!(1, r.rows_written); // column index - assert_eq!(1, column_index.indexes.len()); + assert_eq!(1, column_index.num_pages()); assert_eq!(1, offset_index.page_locations.len()); assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order); - assert!(!column_index.indexes[0].is_null_page()); - assert_eq!(Some(0), column_index.indexes[0].null_count()); + assert!(!column_index.is_null_page(0)); + assert_eq!(Some(0), column_index.null_count(0)); if let Some(stats) = r.metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count_opt(), None); if let Statistics::FixedLenByteArray(_stats) = stats { - let column_index_min_value = column_index.indexes[0].min_bytes().unwrap(); - let column_index_max_value = column_index.indexes[0].max_bytes().unwrap(); + let column_index_min_value = column_index.min_value(0).unwrap(); + let column_index_max_value = column_index.max_value(0).unwrap(); assert_eq!(column_index_min_value.len(), 1); assert_eq!(column_index_max_value.len(), 1); @@ -3175,11 +3169,11 @@ mod tests { // ensure bytes weren't truncated for column index let column_index = r.column_index.unwrap(); let column_index = match column_index { - Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, _ => panic!("wrong stats type"), }; - let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap(); - let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap(); + let column_index_min_bytes = column_index.min_value(0).unwrap(); + let 
column_index_max_bytes = column_index.max_value(0).unwrap(); assert_eq!(expected_value, column_index_min_bytes); assert_eq!(expected_value, column_index_max_bytes); @@ -3218,11 +3212,11 @@ mod tests { // ensure bytes weren't truncated for column index let column_index = r.column_index.unwrap(); let column_index = match column_index { - Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => column_index, _ => panic!("wrong stats type"), }; - let column_index_min_bytes = column_index.indexes[0].min_bytes().unwrap(); - let column_index_max_bytes = column_index.indexes[0].min_bytes().unwrap(); + let column_index_min_bytes = column_index.min_value(0).unwrap(); + let column_index_max_bytes = column_index.max_value(0).unwrap(); assert_eq!(expected_value, column_index_min_bytes); assert_eq!(expected_value, column_index_max_bytes); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 6f3a842d0985..47f805641470 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -101,19 +101,20 @@ use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; +use crate::file::page_index::column_index::{ + ByteArrayColumnIndex, ColumnIndex, PrimitiveColumnIndex, +}; +use crate::file::statistics::{self, Statistics}; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}, }; -use crate::file::{ - page_index::index::PageIndex, - statistics::{self, Statistics}, -}; use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; +use crate::thrift_struct; use crate::{ basic::BoundaryOrder, errors::{ParquetError, Result}, @@ -128,10 +129,6 @@ use crate::{ use crate::{ data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData, }; -use crate::{ - file::page_index::index::{Index, NativeIndex}, - thrift_struct, -}; pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader}; use std::io::Write; use std::ops::Range; @@ -1630,135 +1627,141 @@ impl ColumnIndexBuilder { /// Build and get the column index /// /// Note: callers should check [`Self::valid`] before calling this method - pub fn build(self) -> Result { + pub fn build(self) -> Result { Ok(match self.column_type { Type::BOOLEAN => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::BOOLEAN(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::BOOLEAN(index) } Type::INT32 => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::INT32(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::INT32(index) } Type::INT64 => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::INT64(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::INT64(index) } Type::INT96 => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::INT96(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::INT96(index) } Type::FLOAT => { - let (indexes, boundary_order) = 
self.build_page_index()?; - Index::FLOAT(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::FLOAT(index) } Type::DOUBLE => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::DOUBLE(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_page_index()?; + ColumnIndexMetaData::DOUBLE(index) } Type::BYTE_ARRAY => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::BYTE_ARRAY(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_byte_array_index()?; + ColumnIndexMetaData::BYTE_ARRAY(index) } Type::FIXED_LEN_BYTE_ARRAY => { - let (indexes, boundary_order) = self.build_page_index()?; - Index::FIXED_LEN_BYTE_ARRAY(NativeIndex { - indexes, - boundary_order, - }) + let index = self.build_byte_array_index()?; + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) } }) } - fn build_page_index(self) -> Result<(Vec>, BoundaryOrder)> + fn build_page_index(self) -> Result> where T: ParquetValueType, { - let len = self.min_values.len(); + let column_index = ColumnIndex { + null_pages: self.null_pages, + boundary_order: self.boundary_order, + null_counts: Some(self.null_counts), + repetition_level_histograms: self.repetition_level_histograms, + definition_level_histograms: self.definition_level_histograms, + }; - let null_counts = self - .null_counts + // TODO(ets): refactor this into column index + let min_values = self + .min_values .iter() - .map(|x| Some(*x)) - .collect::>(); + .zip(column_index.null_pages.iter()) + .map(|(min, is_null)| { + if *is_null { + Ok(Default::default()) + } else { + Ok(T::try_from_le_slice(min)?) + } + }) + .collect::, ParquetError>>()?; - // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. - let to_page_histograms = |opt_hist: Option>| { - if let Some(hist) = opt_hist { - // TODO: should we assert (hist.len() % len) == 0? - let num_levels = hist.len() / len; - let mut res = Vec::with_capacity(len); - for i in 0..len { - let page_idx = i * num_levels; - let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); - res.push(Some(LevelHistogram::from(page_hist))); + let max_values = self + .max_values + .iter() + .zip(column_index.null_pages.iter()) + .map(|(max, is_null)| { + if *is_null { + Ok(Default::default()) + } else { + Ok(T::try_from_le_slice(max)?) 
} - res + }) + .collect::, ParquetError>>()?; + + Ok(PrimitiveColumnIndex { + column_index, + min_values, + max_values, + }) + } + + fn build_byte_array_index(self) -> Result { + // TODO(ets): refactor this into column index so we can reuse all this code + let len = self.null_pages.len(); + + let min_len = self.min_values.iter().map(|v| v.len()).sum(); + let max_len = self.max_values.iter().map(|v| v.len()).sum(); + let mut min_bytes = vec![0u8; min_len]; + let mut max_bytes = vec![0u8; max_len]; + + let mut min_offsets = vec![0usize; len + 1]; + let mut max_offsets = vec![0usize; len + 1]; + + let mut min_pos = 0; + let mut max_pos = 0; + + for (i, is_null) in self.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = &self.min_values[i]; + let dst = &mut min_bytes[min_pos..min_pos + min.len()]; + dst.copy_from_slice(min); + min_offsets[i] = min_pos; + min_pos += min.len(); + + let max = &self.max_values[i]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; + dst.copy_from_slice(max); + max_offsets[i] = max_pos; + max_pos += max.len(); } else { - vec![None; len] + min_offsets[i] = min_pos; + max_offsets[i] = max_pos; } - }; + } - let rep_hists: Vec> = - to_page_histograms(self.repetition_level_histograms); - let def_hists: Vec> = - to_page_histograms(self.definition_level_histograms); + min_offsets[len] = min_pos; + max_offsets[len] = max_pos; - let indexes = self - .min_values - .iter() - .zip(self.max_values.iter()) - .zip(self.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .zip(rep_hists.into_iter()) - .zip(def_hists.into_iter()) - .map( - |( - ((((min, max), is_null), null_count), repetition_level_histogram), - definition_level_histogram, - )| { - let (min, max) = if is_null { - (None, None) - } else { - ( - Some(T::try_from_le_slice(min)?), - Some(T::try_from_le_slice(max)?), - ) - }; - Ok(PageIndex { - min, - max, - null_count, - repetition_level_histogram, - definition_level_histogram, - }) - }, - ) - .collect::, ParquetError>>()?; + let column_index = ColumnIndex { + null_pages: self.null_pages, + boundary_order: self.boundary_order, + null_counts: Some(self.null_counts), + repetition_level_histograms: self.repetition_level_histograms, + definition_level_histograms: self.definition_level_histograms, + }; - let boundary_order = self.boundary_order; - Ok((indexes, boundary_order)) + Ok(ByteArrayColumnIndex { + column_index, + min_bytes, + min_offsets, + max_bytes, + max_offsets, + }) } } diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 39ee1c8fe76e..a09a703adef8 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -54,7 +54,7 @@ pub(crate) struct ThriftMetadataWriter<'a, W: Write> { schema: &'a TypePtr, schema_descr: &'a SchemaDescPtr, row_groups: Vec, - column_indexes: Option<&'a [Vec>]>, + column_indexes: Option<&'a [Vec>]>, offset_indexes: Option<&'a [Vec>]>, key_value_metadata: Option>, created_by: Option, @@ -103,7 +103,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { /// of the serialized column indexes. 
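// [editor's note] Illustrative aside, not part of the patch: serializing a page index
// is an append-plus-bookkeeping step. A minimal sketch of the idea, assuming a
// `TrackedWrite`-style sink that reports `bytes_written()` and the parquet-format
// `column_index_offset`/`column_index_length` fields on the thrift ColumnChunk:
//
//     let start = buf.bytes_written();
//     column_index.write_thrift(&mut ThriftCompactOutputProtocol::new(&mut buf))?;
//     column_chunk.column_index_offset = Some(start as i64);
//     column_chunk.column_index_length = Some((buf.bytes_written() - start) as i32);
//
// Helper and field names here are assumptions for illustration; the function below
// shows the actual interface this writer exposes.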
fn write_column_indexes( &mut self, - column_indexes: &[Vec>], + column_indexes: &[Vec>], ) -> Result<()> { // iter row group // iter each column @@ -223,7 +223,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { pub fn with_column_indexes( mut self, - column_indexes: &'a [Vec>], + column_indexes: &'a [Vec>], ) -> Self { self.column_indexes = Some(column_indexes); self @@ -391,40 +391,14 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { Ok(()) } - fn convert_column_indexes(&self) -> Vec>> { + fn convert_column_indexes(&self) -> Vec>> { if let Some(row_group_column_indexes) = self.metadata.column_index() { (0..self.metadata.row_groups().len()) .map(|rg_idx| { let column_indexes = &row_group_column_indexes[rg_idx]; column_indexes .iter() - .map(|column_index| match column_index { - ColumnIndexMetaData::NONE => None, - ColumnIndexMetaData::BOOLEAN(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::BYTE_ARRAY(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::DOUBLE(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::FLOAT(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::INT32(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::INT64(column_index) => { - Some(column_index.to_thrift()) - } - ColumnIndexMetaData::INT96(column_index) => { - Some(column_index.to_thrift()) - } - }) + .map(|column_index| Some(column_index.clone())) .collect() }) .collect() @@ -497,25 +471,25 @@ impl MetadataObjectWriter { /// Write a column [`OffsetIndex`] in Thrift format fn write_offset_index( &self, - offset_index: &crate::format::OffsetIndex, + offset_index: &OffsetIndexMetaData, _column_chunk: &crate::format::ColumnChunk, _row_group_idx: usize, _column_idx: usize, sink: impl Write, ) -> Result<()> { - Self::write_object(offset_index, sink) + Self::write_thrift_object(offset_index, sink) } /// Write a column [`ColumnIndex`] in Thrift format fn write_column_index( &self, - column_index: &crate::format::ColumnIndex, + column_index: &ColumnIndexMetaData, _column_chunk: &crate::format::ColumnChunk, _row_group_idx: usize, _column_idx: usize, sink: impl Write, ) -> Result<()> { - Self::write_object(column_index, sink) + Self::write_thrift_object(column_index, sink) } /// No-op implementation of row-group metadata encryption @@ -609,14 +583,14 @@ impl MetadataObjectWriter { /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md fn write_column_index( &self, - column_index: &crate::format::ColumnIndex, + column_index: &ColumnIndexMetaData, column_chunk: &crate::format::ColumnChunk, row_group_idx: usize, column_idx: usize, sink: impl Write, ) -> Result<()> { match &self.file_encryptor { - Some(file_encryptor) => Self::write_object_with_encryption( + Some(file_encryptor) => Self::write_thrift_object_with_encryption( column_index, sink, file_encryptor, @@ -625,7 +599,7 @@ impl MetadataObjectWriter { row_group_idx, column_idx, ), - None => Self::write_object(column_index, sink), + None => Self::write_thrift_object(column_index, sink), } } @@ -658,49 +632,6 @@ impl MetadataObjectWriter { ) } - fn write_object_with_encryption( - object: &impl TSerializable, - mut sink: impl Write, - file_encryptor: &FileEncryptor, - column_metadata: &crate::format::ColumnChunk, - module_type: ModuleType, - row_group_index: usize, - column_index: usize, - ) -> Result<()> 
{ - let column_path_vec = &column_metadata - .meta_data - .as_ref() - .ok_or_else(|| { - general_err!( - "Column metadata not set for column {} when encrypting object", - column_index - ) - })? - .path_in_schema; - - let joined_column_path; - let column_path = if column_path_vec.len() == 1 { - &column_path_vec[0] - } else { - joined_column_path = column_path_vec.join("."); - &joined_column_path - }; - - if file_encryptor.is_column_encrypted(column_path) { - let aad = create_module_aad( - file_encryptor.file_aad(), - module_type, - row_group_index, - column_index, - None, - )?; - let mut encryptor = file_encryptor.get_column_encryptor(column_path)?; - encrypt_object(object, &mut encryptor, &mut sink, &aad) - } else { - Self::write_object(object, sink) - } - } - fn write_thrift_object_with_encryption( object: &impl WriteThrift, mut sink: impl Write, diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index 2d43c93b2e4b..77d922861687 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -22,7 +22,10 @@ use crate::{ data_type::{ByteArray, FixedLenByteArray}, - errors::Result, + errors::{ParquetError, Result}, + parquet_thrift::{ + ElementType, FieldType, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, + }, }; use std::ops::Deref; @@ -124,35 +127,6 @@ impl PrimitiveColumnIndex { max_values, }) } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .min_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let max_values = self - .max_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } } impl PrimitiveColumnIndex { @@ -229,6 +203,53 @@ impl Deref for PrimitiveColumnIndex { } } +impl WriteThrift for PrimitiveColumnIndex { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift( + &self, + writer: &mut ThriftCompactOutputProtocol, + ) -> Result<()> { + self.null_pages.write_thrift_field(writer, 1, 0)?; + + // need to handle min/max manually + let len = self.null_pages.len(); + writer.write_field_begin(FieldType::List, 2, 1)?; + writer.write_list_begin(ElementType::Binary, len)?; + for i in 0..len { + let min = self.min_value(i).map(|m| m.as_bytes()).unwrap_or(&[]); + min.write_thrift(writer)?; + } + writer.write_field_begin(FieldType::List, 3, 2)?; + writer.write_list_begin(ElementType::Binary, len)?; + for i in 0..len { + let max = self.max_value(i).map(|m| m.as_bytes()).unwrap_or(&[]); + max.write_thrift(writer)?; + } + let mut last_field_id = self.boundary_order.write_thrift_field(writer, 4, 3)?; + if self.null_counts.is_some() { + last_field_id = + self.null_counts + .as_ref() + .unwrap() + .write_thrift_field(writer, 5, last_field_id)?; + } + if self.repetition_level_histograms.is_some() { + last_field_id = self + .repetition_level_histograms + .as_ref() + .unwrap() + .write_thrift_field(writer, 6, last_field_id)?; + } + if self.definition_level_histograms.is_some() { + self.definition_level_histograms + .as_ref() + .unwrap() + 
.write_thrift_field(writer, 7, last_field_id)?; + } + writer.write_struct_end() + } +} + /// Column index for byte arrays (fixed length and variable) #[derive(Debug, Clone, PartialEq)] pub struct ByteArrayColumnIndex { @@ -344,33 +365,6 @@ impl ByteArrayColumnIndex { } }) } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let mut min_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); - } - - let mut max_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); - } - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } } impl Deref for ByteArrayColumnIndex { @@ -381,6 +375,53 @@ impl Deref for ByteArrayColumnIndex { } } +impl WriteThrift for ByteArrayColumnIndex { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift( + &self, + writer: &mut ThriftCompactOutputProtocol, + ) -> Result<()> { + self.null_pages.write_thrift_field(writer, 1, 0)?; + + // need to handle min/max manually + let len = self.null_pages.len(); + writer.write_field_begin(FieldType::List, 2, 1)?; + writer.write_list_begin(ElementType::Binary, len)?; + for i in 0..len { + let min = self.min_value(i).unwrap_or(&[]); + min.write_thrift(writer)?; + } + writer.write_field_begin(FieldType::List, 3, 2)?; + writer.write_list_begin(ElementType::Binary, len)?; + for i in 0..len { + let max = self.max_value(i).unwrap_or(&[]); + max.write_thrift(writer)?; + } + let mut last_field_id = self.boundary_order.write_thrift_field(writer, 4, 3)?; + if self.null_counts.is_some() { + last_field_id = + self.null_counts + .as_ref() + .unwrap() + .write_thrift_field(writer, 5, last_field_id)?; + } + if self.repetition_level_histograms.is_some() { + last_field_id = self + .repetition_level_histograms + .as_ref() + .unwrap() + .write_thrift_field(writer, 6, last_field_id)?; + } + if self.definition_level_histograms.is_some() { + self.definition_level_histograms + .as_ref() + .unwrap() + .write_thrift_field(writer, 7, last_field_id)?; + } + writer.write_struct_end() + } +} + // Macro to generate getter functions for ColumnIndexMetaData. macro_rules! 
colidx_enum_func { ($self:ident, $func:ident, $arg:ident) => {{ @@ -567,3 +608,24 @@ column_index_iters!(ByteArray, BYTE_ARRAY, |v| v .map(|v| ByteArray::from(v.to_owned()))); column_index_iters!(FixedLenByteArray, FIXED_LEN_BYTE_ARRAY, |v| v .map(|v| FixedLenByteArray::from(v.to_owned()))); + +impl WriteThrift for ColumnIndexMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift( + &self, + writer: &mut ThriftCompactOutputProtocol, + ) -> Result<()> { + match self { + ColumnIndexMetaData::BOOLEAN(index) => index.write_thrift(writer), + ColumnIndexMetaData::INT32(index) => index.write_thrift(writer), + ColumnIndexMetaData::INT64(index) => index.write_thrift(writer), + ColumnIndexMetaData::INT96(index) => index.write_thrift(writer), + ColumnIndexMetaData::FLOAT(index) => index.write_thrift(writer), + ColumnIndexMetaData::DOUBLE(index) => index.write_thrift(writer), + ColumnIndexMetaData::BYTE_ARRAY(index) => index.write_thrift(writer), + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => index.write_thrift(writer), + _ => Err(general_err!("Cannot serialize NONE index")), + } + } +} diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 861dc0c3b04e..abc0bf1b8e62 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -264,6 +264,7 @@ impl NativeIndex { }) } + #[allow(dead_code)] pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { let min_values = self .indexes diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7a4706964156..985903cad1b0 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -20,7 +20,7 @@ use crate::bloom_filter::Sbbf; use crate::file::metadata::thrift_gen::PageHeader; -use crate::file::page_index::index::Index; +use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use std::fmt::Debug; @@ -128,7 +128,7 @@ pub type OnCloseRowGroup<'a, W> = Box< &'a mut TrackedWrite, RowGroupMetaData, Vec>, - Vec>, + Vec>, Vec>, ) -> Result<()> + 'a @@ -154,7 +154,7 @@ pub struct SerializedFileWriter { props: WriterPropertiesPtr, row_groups: Vec, bloom_filters: Vec>>, - column_indexes: Vec>>, + column_indexes: Vec>>, offset_indexes: Vec>>, row_group_index: usize, // kv_metadatas will be appended to `props` when `write_metadata` @@ -339,8 +339,6 @@ impl SerializedFileWriter { .map(|v| v.to_thrift()) .collect::>(); - let column_indexes = self.convert_column_indexes(); - let mut encoder = ThriftMetadataWriter::new( &mut self.buf, &self.schema, @@ -359,34 +357,11 @@ impl SerializedFileWriter { encoder = encoder.with_key_value_metadata(key_value_metadata) } - encoder = encoder.with_column_indexes(&column_indexes); + encoder = encoder.with_column_indexes(&self.column_indexes); encoder = encoder.with_offset_indexes(&self.offset_indexes); encoder.finish() } - fn convert_column_indexes(&self) -> Vec>> { - self.column_indexes - .iter() - .map(|cis| { - cis.iter() - .map(|ci| { - ci.as_ref().map(|column_index| match column_index { - Index::NONE => panic!("trying to serialize missing column index"), - Index::BOOLEAN(column_index) => column_index.to_thrift(), - Index::BYTE_ARRAY(column_index) => column_index.to_thrift(), - Index::DOUBLE(column_index) => column_index.to_thrift(), - Index::FIXED_LEN_BYTE_ARRAY(column_index) => column_index.to_thrift(), - Index::FLOAT(column_index) => 
column_index.to_thrift(), - Index::INT32(column_index) => column_index.to_thrift(), - Index::INT64(column_index) => column_index.to_thrift(), - Index::INT96(column_index) => column_index.to_thrift(), - }) - }) - .collect() - }) - .collect() - } - #[inline] fn assert_previous_writer_closed(&self) -> Result<()> { if self.finished { @@ -525,7 +500,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { row_group_metadata: Option, column_chunks: Vec, bloom_filters: Vec>, - column_indexes: Vec>, + column_indexes: Vec>, offset_indexes: Vec>, row_group_index: i16, file_offset: i64, From ecd24de2eebeb34caf4332e5e983aa1c68404c83 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 10:34:26 -0700 Subject: [PATCH 062/126] copy over tests from index --- parquet/src/file/page_index/column_index.rs | 75 +++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index 77d922861687..549d1bbd0c66 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -629,3 +629,78 @@ impl WriteThrift for ColumnIndexMetaData { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_page_index_min_max_null() { + let column_index = PrimitiveColumnIndex { + column_index: ColumnIndex { + null_pages: vec![false], + boundary_order: BoundaryOrder::ASCENDING, + null_counts: Some(vec![0]), + repetition_level_histograms: Some(vec![1, 2]), + definition_level_histograms: Some(vec![1, 2, 3]), + }, + min_values: vec![-123], + max_values: vec![234], + }; + + assert_eq!(column_index.min_value(0), Some(&-123)); + assert_eq!(column_index.max_value(0), Some(&234)); + assert_eq!(column_index.null_count(0), Some(0)); + assert_eq!(column_index.repetition_level_histogram(0).unwrap(), &[1, 2]); + assert_eq!( + column_index.definition_level_histogram(0).unwrap(), + &[1, 2, 3] + ); + } + + #[test] + fn test_page_index_min_max_null_none() { + let column_index: PrimitiveColumnIndex = PrimitiveColumnIndex:: { + column_index: ColumnIndex { + null_pages: vec![true], + boundary_order: BoundaryOrder::ASCENDING, + null_counts: Some(vec![1]), + repetition_level_histograms: None, + definition_level_histograms: Some(vec![1, 0]), + }, + min_values: vec![Default::default()], + max_values: vec![Default::default()], + }; + + assert_eq!(column_index.min_value(0), None); + assert_eq!(column_index.max_value(0), None); + assert_eq!(column_index.null_count(0), Some(1)); + assert_eq!(column_index.repetition_level_histogram(0), None); + assert_eq!(column_index.definition_level_histogram(0).unwrap(), &[1, 0]); + } + + #[test] + fn test_invalid_column_index() { + let column_index = ThriftColumnIndex { + null_pages: vec![true, false], + min_values: vec![ + &[], + &[], // this shouldn't be empty as null_pages[1] is false + ], + max_values: vec![ + &[], + &[], // this shouldn't be empty as null_pages[1] is false + ], + null_counts: None, + repetition_level_histograms: None, + definition_level_histograms: None, + boundary_order: BoundaryOrder::UNORDERED, + }; + + let err = PrimitiveColumnIndex::::try_new(column_index).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: error converting value, expected 4 bytes got 0" + ); + } +} From 1e510bcd3e1f73aa868f27107dad79692a7b2552 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 10:52:19 -0700 Subject: [PATCH 063/126] remove index module --- parquet/src/file/metadata/memory.rs | 29 -- parquet/src/file/page_index/index.rs | 456 
--------------------------- parquet/src/file/page_index/mod.rs | 1 - 3 files changed, 486 deletions(-) delete mode 100644 parquet/src/file/page_index/index.rs diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 69eee3c2999d..19122a1b5522 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -27,7 +27,6 @@ use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; -use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; use std::sync::Arc; @@ -199,34 +198,6 @@ impl HeapSize for ByteArrayColumnIndex { } } -impl HeapSize for Index { - fn heap_size(&self) -> usize { - match self { - Index::NONE => 0, - Index::BOOLEAN(native_index) => native_index.heap_size(), - Index::INT32(native_index) => native_index.heap_size(), - Index::INT64(native_index) => native_index.heap_size(), - Index::INT96(native_index) => native_index.heap_size(), - Index::FLOAT(native_index) => native_index.heap_size(), - Index::DOUBLE(native_index) => native_index.heap_size(), - Index::BYTE_ARRAY(native_index) => native_index.heap_size(), - Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(), - } - } -} - -impl HeapSize for NativeIndex { - fn heap_size(&self) -> usize { - self.indexes.heap_size() + self.boundary_order.heap_size() - } -} - -impl HeapSize for PageIndex { - fn heap_size(&self) -> usize { - self.min.heap_size() + self.max.heap_size() + self.null_count.heap_size() - } -} - impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { self.min_opt().map(T::heap_size).unwrap_or(0) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs deleted file mode 100644 index abc0bf1b8e62..000000000000 --- a/parquet/src/file/page_index/index.rs +++ /dev/null @@ -1,456 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! [`Index`] structures holding decoded [`ColumnIndex`] information -//! -//! 
[`ColumnIndex`]: crate::format::ColumnIndex - -use crate::basic::{BoundaryOrder, Type}; -use crate::data_type::private::ParquetValueType; -use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; -use crate::errors::ParquetError; -use crate::file::metadata::LevelHistogram; -use crate::file::page_index::index_reader::ThriftColumnIndex; -use std::fmt::Debug; - -/// Typed statistics for one data page -/// -/// See [`NativeIndex`] for more details -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct PageIndex { - /// The minimum value, It is None when all values are null - pub min: Option, - /// The maximum value, It is None when all values are null - pub max: Option, - /// Null values in the page - pub null_count: Option, - /// Repetition level histogram for the page - /// - /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. - /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains. - pub repetition_level_histogram: Option, - /// Definition level histogram for the page - /// - /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. - /// For example, `definition_level_histogram[max_definition_level]` indicates how many - /// non-null values are present in the page. - pub definition_level_histogram: Option, -} - -impl PageIndex { - /// Returns the minimum value in the page - /// - /// It is `None` when all values are null - pub fn min(&self) -> Option<&T> { - self.min.as_ref() - } - - /// Returns the maximum value in the page - /// - /// It is `None` when all values are null - pub fn max(&self) -> Option<&T> { - self.max.as_ref() - } - - /// Returns the number of null values in the page - pub fn null_count(&self) -> Option { - self.null_count - } - - /// Returns the repetition level histogram for the page - pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { - self.repetition_level_histogram.as_ref() - } - - /// Returns the definition level histogram for the page - pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { - self.definition_level_histogram.as_ref() - } - - /// Returns whether this is an all null page - pub fn is_null_page(&self) -> bool { - self.min.is_none() - } -} - -impl PageIndex -where - T: AsBytes, -{ - /// Returns the minimum value in the page as bytes - /// - /// It is `None` when all values are null - pub fn max_bytes(&self) -> Option<&[u8]> { - self.max.as_ref().map(|x| x.as_bytes()) - } - - /// Returns the maximum value in the page as bytes - /// - /// It is `None` when all values are null - pub fn min_bytes(&self) -> Option<&[u8]> { - self.min.as_ref().map(|x| x.as_bytes()) - } -} - -#[derive(Debug, Clone, PartialEq)] -#[allow(non_camel_case_types)] -/// Statistics for data pages in a column chunk. 
-/// -/// See [`NativeIndex`] for more information -pub enum Index { - /// Sometimes reading page index from parquet file - /// will only return pageLocations without min_max index, - /// `NONE` represents this lack of index information - NONE, - /// Boolean type index - BOOLEAN(NativeIndex), - /// 32-bit integer type index - INT32(NativeIndex), - /// 64-bit integer type index - INT64(NativeIndex), - /// 96-bit integer type (timestamp) index - INT96(NativeIndex), - /// 32-bit floating point type index - FLOAT(NativeIndex), - /// 64-bit floating point type index - DOUBLE(NativeIndex), - /// Byte array type index - BYTE_ARRAY(NativeIndex), - /// Fixed length byte array type index - FIXED_LEN_BYTE_ARRAY(NativeIndex), -} - -impl Index { - /// Return min/max elements inside ColumnIndex are ordered or not. - pub fn is_sorted(&self) -> bool { - // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, - if let Some(order) = self.get_boundary_order() { - order != BoundaryOrder::UNORDERED - } else { - false - } - } - - /// Get boundary_order of this page index. - pub fn get_boundary_order(&self) -> Option { - match self { - Index::NONE => None, - Index::BOOLEAN(index) => Some(index.boundary_order), - Index::INT32(index) => Some(index.boundary_order), - Index::INT64(index) => Some(index.boundary_order), - Index::INT96(index) => Some(index.boundary_order), - Index::FLOAT(index) => Some(index.boundary_order), - Index::DOUBLE(index) => Some(index.boundary_order), - Index::BYTE_ARRAY(index) => Some(index.boundary_order), - Index::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), - } - } -} - -/// Strongly typed statistics for data pages in a column chunk. -/// -/// This structure is a natively typed, in memory representation of the -/// [`ColumnIndex`] structure in a parquet file footer, as described in the -/// Parquet [PageIndex documentation]. The statistics stored in this structure -/// can be used by query engines to skip decoding pages while reading parquet -/// data. -/// -/// # Differences with Row Group Level Statistics -/// -/// One significant difference between `NativeIndex` and row group level -/// [`Statistics`] is that page level statistics may not store actual column -/// values as min and max (e.g. they may store truncated strings to save space) -/// -/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -/// [`Statistics`]: crate::file::statistics::Statistics -/// [`ColumnIndex`]: crate::format::ColumnIndex -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct NativeIndex { - /// The actual column indexes, one item per page - pub indexes: Vec>, - /// If the min/max elements are ordered, and if so in which - /// direction. See [source] for details. - /// - /// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964 - pub boundary_order: BoundaryOrder, -} - -impl NativeIndex { - /// The physical data type of the column - pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE; - - /// Creates a new [`NativeIndex`] - #[allow(dead_code)] - pub(crate) fn try_new(index: crate::format::ColumnIndex) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. 
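// [editor's note] Worked example, not part of the patch: given the flattened layout
// described in the comment above, `num_levels = hist.len() / num_pages`, and page i's
// histogram is the subslice `&hist[i * num_levels..(i + 1) * num_levels]`, i.e.
// `hist[i * num_levels + j]` counts the values at level j in page i. The replacement
// `ColumnIndex` keeps this flattened encoding and slices it per page on demand
// (see `repetition_level_histogram(i)` above) instead of materializing a
// `LevelHistogram` per page as this deleted code did.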
- let to_page_histograms = |opt_hist: Option>| { - if let Some(hist) = opt_hist { - // TODO: should we assert (hist.len() % len) == 0? - let num_levels = hist.len() / len; - let mut res = Vec::with_capacity(len); - for i in 0..len { - let page_idx = i * num_levels; - let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); - res.push(Some(LevelHistogram::from(page_hist))); - } - res - } else { - vec![None; len] - } - }; - - let rep_hists: Vec> = - to_page_histograms(index.repetition_level_histograms); - let def_hists: Vec> = - to_page_histograms(index.definition_level_histograms); - - let indexes = index - .min_values - .iter() - .zip(index.max_values.iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .zip(rep_hists.into_iter()) - .zip(def_hists.into_iter()) - .map( - |( - ((((min, max), is_null), null_count), repetition_level_histogram), - definition_level_histogram, - )| { - let (min, max) = if is_null { - (None, None) - } else { - ( - Some(T::try_from_le_slice(min)?), - Some(T::try_from_le_slice(max)?), - ) - }; - Ok(PageIndex { - min, - max, - null_count, - repetition_level_histogram, - definition_level_histogram, - }) - }, - ) - .collect::, ParquetError>>()?; - - let boundary_order = index.boundary_order.try_into()?; - Ok(Self { - indexes, - boundary_order, - }) - } - - #[allow(dead_code)] - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .indexes - .iter() - .map(|x| x.min_bytes().unwrap_or(&[]).to_vec()) - .collect::>(); - - let max_values = self - .indexes - .iter() - .map(|x| x.max_bytes().unwrap_or(&[]).to_vec()) - .collect::>(); - - let null_counts = self - .indexes - .iter() - .map(|x| x.null_count()) - .collect::>>(); - - // Concatenate page histograms into a single Option - let repetition_level_histograms = self - .indexes - .iter() - .map(|x| x.repetition_level_histogram().map(|v| v.values())) - .collect::>>() - .map(|hists| hists.concat()); - - let definition_level_histograms = self - .indexes - .iter() - .map(|x| x.definition_level_histogram().map(|v| v.values())) - .collect::>>() - .map(|hists| hists.concat()); - - crate::format::ColumnIndex::new( - self.indexes.iter().map(|x| x.min().is_none()).collect(), - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } - - /// Creates a new [`NativeIndex`] - #[allow(dead_code)] - pub(super) fn try_new_local(index: ThriftColumnIndex) -> Result { - let len = index.min_values.len(); - - // turn Option> into Vec> - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. - let to_page_histograms = |opt_hist: Option>| { - if let Some(hist) = opt_hist { - // TODO: should we assert (hist.len() % len) == 0? 
- let num_levels = hist.len() / len; - let mut res = Vec::with_capacity(len); - for i in 0..len { - let page_idx = i * num_levels; - let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); - res.push(Some(LevelHistogram::from(page_hist))); - } - res - } else { - vec![None; len] - } - }; - - // turn Option> into Vec> - let rep_hists: Vec> = - to_page_histograms(index.repetition_level_histograms); - let def_hists: Vec> = - to_page_histograms(index.definition_level_histograms); - - // start assembling Vec - let mut indexes: Vec> = Vec::with_capacity(len); - let mut rep_iter = rep_hists.into_iter(); - let mut def_iter = def_hists.into_iter(); - - // this used to zip together the other iters, but that was quite a bit - // slower than this approach. - for (i, null_count) in null_counts.into_iter().enumerate().take(len) { - let is_null = index.null_pages[i]; - let min = if is_null { - None - } else { - Some(T::try_from_le_slice(index.min_values[i])?) - }; - let max = if is_null { - None - } else { - Some(T::try_from_le_slice(index.max_values[i])?) - }; - - indexes.push(PageIndex { - min, - max, - null_count, - repetition_level_histogram: rep_iter.next().unwrap_or(None), - definition_level_histogram: def_iter.next().unwrap_or(None), - }) - } - - let boundary_order = index.boundary_order; - Ok(Self { - indexes, - boundary_order, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_page_index_min_max_null() { - let page_index = PageIndex { - min: Some(-123), - max: Some(234), - null_count: Some(0), - repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])), - definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])), - }; - - assert_eq!(page_index.min().unwrap(), &-123); - assert_eq!(page_index.max().unwrap(), &234); - assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); - assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); - assert_eq!(page_index.null_count().unwrap(), 0); - assert_eq!( - page_index.repetition_level_histogram().unwrap().values(), - &vec![1, 2] - ); - assert_eq!( - page_index.definition_level_histogram().unwrap().values(), - &vec![1, 2, 3] - ); - } - - #[test] - fn test_page_index_min_max_null_none() { - let page_index: PageIndex = PageIndex { - min: None, - max: None, - null_count: None, - repetition_level_histogram: None, - definition_level_histogram: None, - }; - - assert_eq!(page_index.min(), None); - assert_eq!(page_index.max(), None); - assert_eq!(page_index.min_bytes(), None); - assert_eq!(page_index.max_bytes(), None); - assert_eq!(page_index.null_count(), None); - assert_eq!(page_index.repetition_level_histogram(), None); - assert_eq!(page_index.definition_level_histogram(), None); - } - - #[test] - fn test_invalid_column_index() { - let column_index = crate::format::ColumnIndex { - null_pages: vec![true, false], - min_values: vec![ - vec![], - vec![], // this shouldn't be empty as null_pages[1] is false - ], - max_values: vec![ - vec![], - vec![], // this shouldn't be empty as null_pages[1] is false - ], - null_counts: None, - repetition_level_histograms: None, - definition_level_histograms: None, - boundary_order: crate::format::BoundaryOrder::UNORDERED, - }; - - let err = NativeIndex::::try_new(column_index).unwrap_err(); - assert_eq!( - err.to_string(), - "Parquet error: error converting value, expected 4 bytes got 0" - ); - } -} diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index ff70e2eca5dd..71b8290d5d36 100644 --- a/parquet/src/file/page_index/mod.rs 
+++ b/parquet/src/file/page_index/mod.rs @@ -20,6 +20,5 @@ //! [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub mod column_index; -pub mod index; pub mod index_reader; pub mod offset_index; From 29091cd2c9ad02c84f787022293e323beb7445aa Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 12:44:56 -0700 Subject: [PATCH 064/126] refactor column index building --- parquet/src/file/metadata/mod.rs | 111 ++++---------------- parquet/src/file/page_index/column_index.rs | 89 +++++++++++----- parquet/src/file/page_index/index_reader.rs | 28 +++-- 3 files changed, 105 insertions(+), 123 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 47f805641470..7f37b0fd9e54 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -101,9 +101,7 @@ use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; -use crate::file::page_index::column_index::{ - ByteArrayColumnIndex, ColumnIndex, PrimitiveColumnIndex, -}; +use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex}; use crate::file::statistics::{self, Statistics}; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, @@ -1668,100 +1666,33 @@ impl ColumnIndexBuilder { where T: ParquetValueType, { - let column_index = ColumnIndex { - null_pages: self.null_pages, - boundary_order: self.boundary_order, - null_counts: Some(self.null_counts), - repetition_level_histograms: self.repetition_level_histograms, - definition_level_histograms: self.definition_level_histograms, - }; + let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect(); + let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect(); - // TODO(ets): refactor this into column index - let min_values = self - .min_values - .iter() - .zip(column_index.null_pages.iter()) - .map(|(min, is_null)| { - if *is_null { - Ok(Default::default()) - } else { - Ok(T::try_from_le_slice(min)?) - } - }) - .collect::, ParquetError>>()?; - - let max_values = self - .max_values - .iter() - .zip(column_index.null_pages.iter()) - .map(|(max, is_null)| { - if *is_null { - Ok(Default::default()) - } else { - Ok(T::try_from_le_slice(max)?) 
- } - }) - .collect::, ParquetError>>()?; - - Ok(PrimitiveColumnIndex { - column_index, + PrimitiveColumnIndex::try_new( + self.null_pages, + self.boundary_order, + Some(self.null_counts), + self.repetition_level_histograms, + self.definition_level_histograms, min_values, max_values, - }) + ) } fn build_byte_array_index(self) -> Result { - // TODO(ets): refactor this into column index so we can reuse all this code - let len = self.null_pages.len(); - - let min_len = self.min_values.iter().map(|v| v.len()).sum(); - let max_len = self.max_values.iter().map(|v| v.len()).sum(); - let mut min_bytes = vec![0u8; min_len]; - let mut max_bytes = vec![0u8; max_len]; - - let mut min_offsets = vec![0usize; len + 1]; - let mut max_offsets = vec![0usize; len + 1]; - - let mut min_pos = 0; - let mut max_pos = 0; - - for (i, is_null) in self.null_pages.iter().enumerate().take(len) { - if !is_null { - let min = &self.min_values[i]; - let dst = &mut min_bytes[min_pos..min_pos + min.len()]; - dst.copy_from_slice(min); - min_offsets[i] = min_pos; - min_pos += min.len(); - - let max = &self.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + max.len()]; - dst.copy_from_slice(max); - max_offsets[i] = max_pos; - max_pos += max.len(); - } else { - min_offsets[i] = min_pos; - max_offsets[i] = max_pos; - } - } - - min_offsets[len] = min_pos; - max_offsets[len] = max_pos; + let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect(); + let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect(); - let column_index = ColumnIndex { - null_pages: self.null_pages, - boundary_order: self.boundary_order, - null_counts: Some(self.null_counts), - repetition_level_histograms: self.repetition_level_histograms, - definition_level_histograms: self.definition_level_histograms, - }; - - Ok(ByteArrayColumnIndex { - column_index, - min_bytes, - min_offsets, - max_bytes, - max_offsets, - }) + ByteArrayColumnIndex::try_new( + self.null_pages, + self.boundary_order, + Some(self.null_counts), + self.repetition_level_histograms, + self.definition_level_histograms, + min_values, + max_values, + ) } } diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index 549d1bbd0c66..a0893cc9eae9 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -95,18 +95,26 @@ pub struct PrimitiveColumnIndex { } impl PrimitiveColumnIndex { - pub(super) fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); + pub(crate) fn try_new( + null_pages: Vec, + boundary_order: BoundaryOrder, + null_counts: Option>, + repetition_level_histograms: Option>, + definition_level_histograms: Option>, + min_bytes: Vec<&[u8]>, + max_bytes: Vec<&[u8]>, + ) -> Result { + let len = null_pages.len(); let mut min_values = Vec::with_capacity(len); let mut max_values = Vec::with_capacity(len); - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + for (i, is_null) in null_pages.iter().enumerate().take(len) { if !is_null { - let min = index.min_values[i]; + let min = min_bytes[i]; min_values.push(T::try_from_le_slice(min)?); - let max = index.max_values[i]; + let max = max_bytes[i]; max_values.push(T::try_from_le_slice(max)?); } else { // need placeholders @@ -117,16 +125,28 @@ impl PrimitiveColumnIndex { Ok(Self { column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: 
index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, + null_pages, + boundary_order, + null_counts, + repetition_level_histograms, + definition_level_histograms, }, min_values, max_values, }) } + + pub(super) fn try_from_thrift(index: ThriftColumnIndex) -> Result { + Self::try_new( + index.null_pages, + index.boundary_order, + index.null_counts, + index.repetition_level_histograms, + index.definition_level_histograms, + index.min_values, + index.max_values, + ) + } } impl PrimitiveColumnIndex { @@ -262,11 +282,19 @@ pub struct ByteArrayColumnIndex { } impl ByteArrayColumnIndex { - pub(super) fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); - - let min_len = index.min_values.iter().map(|&v| v.len()).sum(); - let max_len = index.max_values.iter().map(|&v| v.len()).sum(); + pub(crate) fn try_new( + null_pages: Vec, + boundary_order: BoundaryOrder, + null_counts: Option>, + repetition_level_histograms: Option>, + definition_level_histograms: Option>, + min_values: Vec<&[u8]>, + max_values: Vec<&[u8]>, + ) -> Result { + let len = null_pages.len(); + + let min_len = min_values.iter().map(|&v| v.len()).sum(); + let max_len = max_values.iter().map(|&v| v.len()).sum(); let mut min_bytes = vec![0u8; min_len]; let mut max_bytes = vec![0u8; max_len]; @@ -276,15 +304,15 @@ impl ByteArrayColumnIndex { let mut min_pos = 0; let mut max_pos = 0; - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + for (i, is_null) in null_pages.iter().enumerate().take(len) { if !is_null { - let min = index.min_values[i]; + let min = min_values[i]; let dst = &mut min_bytes[min_pos..min_pos + min.len()]; dst.copy_from_slice(min); min_offsets[i] = min_pos; min_pos += min.len(); - let max = index.max_values[i]; + let max = max_values[i]; let dst = &mut max_bytes[max_pos..max_pos + max.len()]; dst.copy_from_slice(max); max_offsets[i] = max_pos; @@ -300,13 +328,12 @@ impl ByteArrayColumnIndex { Ok(Self { column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, + null_pages, + boundary_order, + null_counts, + repetition_level_histograms, + definition_level_histograms, }, - min_bytes, min_offsets, max_bytes, @@ -314,6 +341,18 @@ impl ByteArrayColumnIndex { }) } + pub(super) fn try_from_thrift(index: ThriftColumnIndex) -> Result { + Self::try_new( + index.null_pages, + index.boundary_order, + index.null_counts, + index.repetition_level_histograms, + index.definition_level_histograms, + index.min_values, + index.max_values, + ) + } + /// Returns the min value for the page indexed by `idx` /// /// It is `None` when all values are null @@ -697,7 +736,7 @@ mod tests { boundary_order: BoundaryOrder::UNORDERED, }; - let err = PrimitiveColumnIndex::::try_new(column_index).unwrap_err(); + let err = PrimitiveColumnIndex::::try_from_thrift(column_index).unwrap_err(); assert_eq!( err.to_string(), "Parquet error: error converting value, expected 4 bytes got 0" diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 3db597954e6c..fbf97ad92cce 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -171,16 +171,28 @@ pub(crate) fn decode_column_index( let index = match column_type { Type::BOOLEAN => { - 
ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::<bool>::try_new(index)?) + ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::<bool>::try_from_thrift(index)?) + } + Type::INT32 => { + ColumnIndexMetaData::INT32(PrimitiveColumnIndex::<i32>::try_from_thrift(index)?) + } + Type::INT64 => { + ColumnIndexMetaData::INT64(PrimitiveColumnIndex::<i64>::try_from_thrift(index)?) + } + Type::INT96 => { + ColumnIndexMetaData::INT96(PrimitiveColumnIndex::<Int96>::try_from_thrift(index)?) + } + Type::FLOAT => { + ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::<f32>::try_from_thrift(index)?) + } + Type::DOUBLE => { + ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::<f64>::try_from_thrift(index)?) + } + Type::BYTE_ARRAY => { + ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_from_thrift(index)?) + } - Type::INT32 => ColumnIndexMetaData::INT32(PrimitiveColumnIndex::<i32>::try_new(index)?), - Type::INT64 => ColumnIndexMetaData::INT64(PrimitiveColumnIndex::<i64>::try_new(index)?), - Type::INT96 => ColumnIndexMetaData::INT96(PrimitiveColumnIndex::<Int96>::try_new(index)?), - Type::FLOAT => ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::<f32>::try_new(index)?), - Type::DOUBLE => ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::<f64>::try_new(index)?), - Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?), Type::FIXED_LEN_BYTE_ARRAY => { - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?) + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_from_thrift(index)?) } }; From d13463a2b7eaacef91e14fd5d293492a5a7d80e2 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 16:56:51 -0700 Subject: [PATCH 065/126] checkpoint --- parquet/src/arrow/arrow_reader/mod.rs | 3 +- parquet/src/arrow/arrow_writer/mod.rs | 74 +++++------ parquet/src/arrow/async_writer/mod.rs | 6 +- parquet/src/column/writer/mod.rs | 13 +- parquet/src/file/metadata/mod.rs | 6 + parquet/src/file/metadata/thrift_gen.rs | 66 +++++++--- parquet/src/file/metadata/writer.rs | 162 ++++++++++++------------ parquet/src/file/serialized_reader.rs | 4 +- parquet/src/file/writer.rs | 97 ++++++-------- 9 files changed, 216 insertions(+), 215 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs index 2dd41f986af8..fb29bdf6561a 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1185,6 +1185,7 @@ mod tests { FloatType, Int32Type, Int64Type, Int96, Int96Type, }; use crate::errors::Result; + use crate::file::metadata::ParquetMetaData; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; use crate::file::writer::SerializedFileWriter; use crate::schema::parser::parse_message_type; @@ -2913,7 +2914,7 @@ mod tests { schema: TypePtr, field: Option, opts: &TestOptions, ) -> Result { let mut writer_props = opts.writer_props(); if let Some(field) = field { let arrow_schema = Schema::new(vec![field]); diff --git a/parquet/src/arrow/arrow_writer/mod.rs index 0c021aad0c69..56f820a0d94e 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -43,7 +43,7 @@ use crate::data_type::{ByteArray, FixedLenByteArray}; #[cfg(feature = "encryption")] use crate::encryption::encrypt::FileEncryptor; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{KeyValue, RowGroupMetaData}; +use crate::file::metadata::{KeyValue, ParquetMetaData, RowGroupMetaData}; use crate::file::properties::{WriterProperties,
WriterPropertiesPtr}; use crate::file::reader::{ChunkReader, Length}; use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter}; @@ -397,13 +397,13 @@ impl ArrowWriter { /// Unlike [`Self::close`] this does not consume self /// /// Attempting to write after calling finish will result in an error - pub fn finish(&mut self) -> Result { + pub fn finish(&mut self) -> Result { self.flush()?; self.writer.finish() } /// Close and finalize the underlying Parquet writer - pub fn close(mut self) -> Result { + pub fn close(mut self) -> Result { self.finish() } @@ -1487,7 +1487,6 @@ mod tests { use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; use crate::file::metadata::thrift_gen::PageHeader; - use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; @@ -2556,12 +2555,12 @@ mod tests { ArrowWriter::try_new(&mut out, batch.schema(), None).expect("Unable to write file"); writer.write(&batch).unwrap(); let file_meta_data = writer.close().unwrap(); - for row_group in file_meta_data.row_groups { - for column in row_group.columns { - assert!(column.offset_index_offset.is_some()); - assert!(column.offset_index_length.is_some()); - assert!(column.column_index_offset.is_none()); - assert!(column.column_index_length.is_none()); + for row_group in file_meta_data.row_groups() { + for column in row_group.columns() { + assert!(column.offset_index_offset().is_some()); + assert!(column.offset_index_length().is_some()); + assert!(column.column_index_offset().is_none()); + assert!(column.column_index_length().is_none()); } } } @@ -3011,13 +3010,14 @@ mod tests { let file_metadata = writer.close().unwrap(); // Coerced name of "item" should be "element" - assert_eq!(file_metadata.schema[3].name, "element"); + let schema = file_metadata.file_metadata().schema_descr(); + assert_eq!(schema.column(3).name(), "element"); // Coerced name of "entries" should be "key_value" - assert_eq!(file_metadata.schema[5].name, "key_value"); + assert_eq!(schema.column(5).name(), "key_value"); // Coerced name of "keys" should be "key" - assert_eq!(file_metadata.schema[6].name, "key"); + assert_eq!(schema.column(6).name(), "key"); // Coerced name of "values" should be "value" - assert_eq!(file_metadata.schema[7].name, "value"); + assert_eq!(schema.column(7).name(), "value"); // Double check schema after reading from the file let reader = SerializedFileReader::new(file).unwrap(); @@ -3961,15 +3961,15 @@ mod tests { writer.write(&batch).unwrap(); let metadata = writer.close().unwrap(); - assert_eq!(metadata.row_groups.len(), 1); - let row_group = &metadata.row_groups[0]; - assert_eq!(row_group.columns.len(), 2); + assert_eq!(metadata.num_row_groups(), 1); + let row_group = metadata.row_group(0); + assert_eq!(row_group.num_columns(), 2); // Column "a" has both offset and column index, as requested - assert!(row_group.columns[0].offset_index_offset.is_some()); - assert!(row_group.columns[0].column_index_offset.is_some()); + assert!(row_group.column(0).offset_index_offset().is_some()); + assert!(row_group.column(0).column_index_offset().is_some()); // Column "b" should only have offset index - assert!(row_group.columns[1].offset_index_offset.is_some()); - assert!(row_group.columns[1].column_index_offset.is_none()); + assert!(row_group.column(1).offset_index_offset().is_some()); + 
assert!(row_group.column(1).column_index_offset().is_none()); let options = ReadOptionsBuilder::new().with_page_index().build(); let reader = SerializedFileReader::new_with_options(Bytes::from(buf), options).unwrap(); @@ -4036,15 +4036,15 @@ mod tests { writer.write(&batch).unwrap(); let metadata = writer.close().unwrap(); - assert_eq!(metadata.row_groups.len(), 1); - let row_group = &metadata.row_groups[0]; - assert_eq!(row_group.columns.len(), 2); + assert_eq!(metadata.num_row_groups(), 1); + let row_group = metadata.row_group(0); + assert_eq!(row_group.num_columns(), 2); // Column "a" should only have offset index - assert!(row_group.columns[0].offset_index_offset.is_some()); - assert!(row_group.columns[0].column_index_offset.is_none()); + assert!(row_group.column(0).offset_index_offset().is_some()); + assert!(row_group.column(0).column_index_offset().is_none()); // Column "b" should only have offset index - assert!(row_group.columns[1].offset_index_offset.is_some()); - assert!(row_group.columns[1].column_index_offset.is_none()); + assert!(row_group.column(1).offset_index_offset().is_some()); + assert!(row_group.column(1).column_index_offset().is_none()); let options = ReadOptionsBuilder::new().with_page_index().build(); let reader = SerializedFileReader::new_with_options(Bytes::from(buf), options).unwrap(); @@ -4320,14 +4320,10 @@ mod tests { writer.write(&batch).unwrap(); let file_metadata = writer.close().unwrap(); - assert_eq!(file_metadata.row_groups.len(), 1); - assert_eq!(file_metadata.row_groups[0].columns.len(), 1); - let chunk_meta = file_metadata.row_groups[0].columns[0] - .meta_data - .as_ref() - .expect("column metadata missing"); - assert!(chunk_meta.encoding_stats.is_some()); - let chunk_page_stats = chunk_meta.encoding_stats.as_ref().unwrap(); + assert_eq!(file_metadata.num_row_groups(), 1); + assert_eq!(file_metadata.row_group(0).num_columns(), 1); + assert!(file_metadata.row_group(0).column(0).page_encoding_stats().is_some()); + let chunk_page_stats = file_metadata.row_group(0).column(0).page_encoding_stats().unwrap(); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); @@ -4338,11 +4334,7 @@ mod tests { let column = rowgroup.metadata().column(0); assert!(column.page_encoding_stats().is_some()); let file_page_stats = column.page_encoding_stats().unwrap(); - let chunk_stats: Vec = chunk_page_stats - .iter() - .map(|x| crate::file::page_encoding_stats::try_from_thrift(x).unwrap()) - .collect(); - assert_eq!(&chunk_stats, file_page_stats); + assert_eq!(chunk_page_stats, file_page_stats); } #[test] diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index d9133801a1b5..0eebf522a953 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -64,7 +64,7 @@ use crate::{ arrow::{arrow_writer::ArrowWriterOptions, ArrowWriter}, errors::{ParquetError, Result}, file::{ - metadata::{KeyValue, RowGroupMetaData}, + metadata::{KeyValue, ParquetMetaData, RowGroupMetaData}, properties::WriterProperties, }, }; @@ -246,7 +246,7 @@ impl AsyncArrowWriter { /// Unlike [`Self::close`] this does not consume self /// /// Attempting to write after calling finish will result in an error - pub async fn finish(&mut self) -> Result { + pub async fn finish(&mut self) -> Result { let metadata = self.sync_writer.finish()?; // Force to flush the remaining data. @@ -259,7 +259,7 @@ impl AsyncArrowWriter { /// Close and finalize the writer. 
/// /// All the data in the inner buffer will be force flushed. - pub async fn close(mut self) -> Result { + pub async fn close(mut self) -> Result { self.finish().await } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 4d5a7cfa5e5c..be9cd951f54c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3587,19 +3587,14 @@ mod tests { col_writer.close().unwrap(); row_group_writer.close().unwrap(); let file_metadata = writer.close().unwrap(); - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - let stats = file_metadata.row_groups[0].columns[0] - .meta_data - .as_ref() - .unwrap() - .statistics - .as_ref() + let stats = file_metadata.row_group(0).column(0) + .statistics() .unwrap(); - assert!(!stats.is_max_value_exact.unwrap()); + assert!(!stats.max_is_exact()); // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should // be incremented by 1. assert_eq!( - stats.max_value, + stats.max_bytes_opt().map(|v| v.to_vec()), Some([128, 128, 128, 128, 128, 128, 128, 129].to_vec()) ); } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 7f37b0fd9e54..03745a53effc 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -843,6 +843,8 @@ pub struct ColumnChunkMetaData { definition_level_histogram: Option, #[cfg(feature = "encryption")] column_crypto_metadata: Option, + #[cfg(feature = "encryption")] + encrypted_column_metadata: Option>, } /// Histograms for repetition and definition levels. @@ -1227,6 +1229,8 @@ impl ColumnChunkMetaData { definition_level_histogram, #[cfg(feature = "encryption")] column_crypto_metadata, + #[cfg(feature = "encryption")] + encrypted_column_metadata: None, }; Ok(result) } @@ -1365,6 +1369,8 @@ impl ColumnChunkMetaDataBuilder { definition_level_histogram: None, #[cfg(feature = "encryption")] column_crypto_metadata: None, + #[cfg(feature = "encryption")] + encrypted_column_metadata: None, }) } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 06223bf03af8..931bfbaca3d5 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -68,12 +68,12 @@ pub(crate) struct SchemaElement<'a> { ); thrift_struct!( -pub(crate) struct AesGcmV1<'a> { +pub(crate) struct AesGcmV1 { /// AAD prefix - 1: optional binary<'a> aad_prefix + 1: optional binary aad_prefix /// Unique file identifier part of AAD suffix - 2: optional binary<'a> aad_file_unique + 2: optional binary aad_file_unique /// In files encrypted with AAD prefix without storing it, /// readers must supply the prefix @@ -82,12 +82,12 @@ pub(crate) struct AesGcmV1<'a> { ); thrift_struct!( -pub(crate) struct AesGcmCtrV1<'a> { +pub(crate) struct AesGcmCtrV1 { /// AAD prefix - 1: optional binary<'a> aad_prefix + 1: optional binary aad_prefix /// Unique file identifier part of AAD suffix - 2: optional binary<'a> aad_file_unique + 2: optional binary aad_file_unique /// In files encrypted with AAD prefix without storing it, /// readers must supply the prefix @@ -96,24 +96,24 @@ pub(crate) struct AesGcmCtrV1<'a> { ); thrift_union!( -union EncryptionAlgorithm<'a> { - 1: (AesGcmV1<'a>) AES_GCM_V1 - 2: (AesGcmCtrV1<'a>) AES_GCM_CTR_V1 +union EncryptionAlgorithm { + 1: (AesGcmV1) AES_GCM_V1 + 2: (AesGcmCtrV1) AES_GCM_CTR_V1 } ); #[cfg(feature = "encryption")] thrift_struct!( /// Crypto metadata for files with encrypted footer -pub(crate) struct FileCryptoMetaData<'a> 
{ +pub(crate) struct FileCryptoMetaData { /// Encryption algorithm. This field is only used for files /// with encrypted footer. Files with plaintext footer store algorithm id /// inside footer (FileMetaData structure). - 1: required EncryptionAlgorithm<'a> encryption_algorithm + 1: required EncryptionAlgorithm encryption_algorithm /** Retrieval metadata of key used for encryption of footer, * and (possibly) columns **/ - 2: optional binary<'a> key_metadata + 2: optional binary key_metadata } ); @@ -135,8 +135,8 @@ struct FileMetaData<'a> { 5: optional list key_value_metadata 6: optional string created_by 7: optional list column_orders; - 8: optional EncryptionAlgorithm<'a> encryption_algorithm - 9: optional binary<'a> footer_signing_key_metadata + 8: optional EncryptionAlgorithm encryption_algorithm + 9: optional binary footer_signing_key_metadata } ); @@ -205,6 +205,32 @@ struct ColumnMetaData<'a> { } ); +pub(crate) fn column_meta_data_from_chunk<'a>( + column_chunk: &'a ColumnChunkMetaData, +) -> ColumnMetaData<'a> { + ColumnMetaData { + type_: column_chunk.column_type(), + encodings: column_chunk.encodings.clone(), + codec: column_chunk.compression, + num_values: column_chunk.num_values, + total_uncompressed_size: column_chunk.total_uncompressed_size, + total_compressed_size: column_chunk.total_compressed_size, + data_page_offset: column_chunk.data_page_offset, + index_page_offset: column_chunk.index_page_offset, + dictionary_page_offset: column_chunk.dictionary_page_offset, + statistics: column_chunk.statistics(), + encoding_stats: column_chunk.encoding_stats.clone(), + bloom_filter_offset: column_chunk.bloom_filter_offset, + bloom_filter_length: column_chunk.bloom_filter_length, + size_statistics: Some(SizeStatistics { + unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes, + repetition_level_histogram: column_chunk.repetition_level_histogram, + definition_level_histogram: column_chunk.definition_level_histogram, + }), + geospatial_statistics: None, + } +} + thrift_struct!( struct BoundingBox { 1: required double xmin; @@ -337,8 +363,6 @@ fn convert_column( let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); - // FIXME: need column crypto - let result = ColumnChunkMetaData { column_descr, encodings, @@ -364,6 +388,8 @@ fn convert_column( definition_level_histogram, #[cfg(feature = "encryption")] column_crypto_metadata: column.crypto_metadata, + #[cfg(feature = "encryption")] + encrypted_column_metadata: None, }; Ok(result) } @@ -632,7 +658,7 @@ pub(crate) fn parquet_metadata_with_encryption( } let decryptor = get_file_decryptor( t_file_crypto_metadata.encryption_algorithm, - t_file_crypto_metadata.key_metadata, + t_file_crypto_metadata.key_metadata.as_ref(), file_decryption_properties, )?; let footer_decryptor = decryptor.get_footer_decryptor(); @@ -672,7 +698,7 @@ pub(crate) fn parquet_metadata_with_encryption( // File has a plaintext footer but encryption algorithm is set let file_decryptor_value = get_file_decryptor( algo, - file_meta.footer_signing_key_metadata, + file_meta.footer_signing_key_metadata.as_ref(), file_decryption_properties, )?; if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { @@ -733,7 +759,7 @@ pub(crate) fn parquet_metadata_with_encryption( #[cfg(feature = "encryption")] pub(super) fn get_file_decryptor( encryption_algorithm: EncryptionAlgorithm, - footer_key_metadata: 
Option<&[u8]>, + footer_key_metadata: Option<&Vec>, file_decryption_properties: &FileDecryptionProperties, ) -> Result { match encryption_algorithm { @@ -750,7 +776,7 @@ pub(super) fn get_file_decryptor( FileDecryptor::new( file_decryption_properties, - footer_key_metadata, + footer_key_metadata.map(|v| v.as_slice()), aad_file_unique, aad_prefix, ) diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index a09a703adef8..d53cd6c8a576 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -15,22 +15,28 @@ // specific language governing permissions and limitations // under the License. +use crate::file::metadata::thrift_gen::EncryptionAlgorithm; +use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; +use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::thrift::TSerializable; +use crate::{ + basic::ColumnOrder, + file::metadata::{FileMetaData, ParquetMetaDataBuilder}, + schema::types, +}; #[cfg(feature = "encryption")] -use crate::encryption::{ - encrypt::{ - encrypt_object, encrypt_object_to_vec, write_signed_plaintext_object, FileEncryptor, +use crate::{ + encryption::{ + encrypt::{encrypt_object, write_signed_plaintext_object, FileEncryptor}, + modules::{create_footer_aad, create_module_aad, ModuleType}, }, - modules::{create_footer_aad, create_module_aad, ModuleType}, + file::metadata::thrift_gen::FileCryptoMetaData, }; -#[cfg(feature = "encryption")] -use crate::errors::ParquetError; -use crate::format::EncryptionAlgorithm; -#[cfg(feature = "encryption")] -use crate::format::{AesGcmV1, ColumnCryptoMetaData}; -use crate::schema::types; -use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; -use crate::thrift::TSerializable; use crate::{errors::Result, file::page_index::column_index::ColumnIndexMetaData}; +#[cfg(feature = "encryption")] +use crate::{ + file::column_crypto_metadata::ColumnCryptoMetaData, file::metadata::thrift_gen::AesGcmV1, +}; use crate::{ file::writer::{get_file_magic, TrackedWrite}, parquet_thrift::WriteThrift, @@ -53,7 +59,7 @@ pub(crate) struct ThriftMetadataWriter<'a, W: Write> { buf: &'a mut TrackedWrite, schema: &'a TypePtr, schema_descr: &'a SchemaDescPtr, - row_groups: Vec, + row_groups: Vec, column_indexes: Option<&'a [Vec>]>, offset_indexes: Option<&'a [Vec>]>, key_value_metadata: Option>, @@ -130,7 +136,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { } /// Assembles and writes the final metadata to self.buf - pub fn finish(mut self) -> Result { + pub fn finish(mut self) -> Result { let num_rows = self.row_groups.iter().map(|x| x.num_rows).sum(); // Write column indexes and offset indexes @@ -146,35 +152,42 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { // for all leaf nodes. // Even if the column has an undefined sort order, such as INTERVAL, this // is still technically the defined TYPEORDER so it should still be set. - let column_orders = (0..self.schema_descr.num_columns()) - .map(|_| crate::format::ColumnOrder::TYPEORDER(crate::format::TypeDefinedOrder {})) + let column_orders = self + .schema_descr + .columns() + .iter() + .map(|col| { + let sort_order = ColumnOrder::get_sort_order( + col.logical_type(), + col.converted_type(), + col.physical_type(), + ); + ColumnOrder::TYPE_DEFINED_ORDER(sort_order) + }) .collect(); + // This field is optional, perhaps in cases where no min/max fields are set // in any Statistics or ColumnIndex object in the whole file. // But for simplicity we always set this field. 
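+        // For example (illustrative, not exhaustive): a plain INT32 leaf yields
+        // SortOrder::SIGNED, a UTF8 string leaf yields SortOrder::UNSIGNED, and an
+        // INTERVAL leaf yields SortOrder::UNDEFINED; each is wrapped in
+        // ColumnOrder::TYPE_DEFINED_ORDER above.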
let column_orders = Some(column_orders); + let (row_groups, unencrypted_row_groups) = self .object_writer .apply_row_group_encryption(self.row_groups)?; let (encryption_algorithm, footer_signing_key_metadata) = self.object_writer.get_plaintext_footer_crypto_metadata(); - let key_value_metadata = self.key_value_metadata.map(|vkv| { - vkv.into_iter() - .map(|kv| crate::format::KeyValue::new(kv.key, kv.value)) - .collect::>() - }); - let mut file_metadata = crate::format::FileMetaData { + + let mut file_metadata = FileMetaData::new( + self.writer_version, num_rows, - row_groups, - key_value_metadata, - version: self.writer_version, - schema: types::to_thrift(self.schema.as_ref())?, - created_by: self.created_by.clone(), + self.created_by, + self.key_value_metadata, + self.schema_descr.clone(), column_orders, - encryption_algorithm, - footer_signing_key_metadata, - }; + ); + + let schema = types::to_thrift(self.schema.as_ref())?; // Write file metadata let start_pos = self.buf.bytes_written(); @@ -196,14 +209,20 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { file_metadata.row_groups = row_groups; } - Ok(file_metadata) + let builder = ParquetMetaDataBuilder::new(file_metadata); + builder.set_row_groups(self.row_groups.clone()); + + builder.set_column_index(self.column_indexes.map(|ci| ci.to_vec())); + builder.set_offset_index(self.offset_indexes.map(|oi| oi.to_vec())); + + Ok(builder.build()) } pub fn new( buf: &'a mut TrackedWrite, schema: &'a TypePtr, schema_descr: &'a SchemaDescPtr, - row_groups: Vec, + row_groups: &'a mut Vec, created_by: Option, writer_version: i32, ) -> Self { @@ -361,12 +380,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { let schema_descr = Arc::new(SchemaDescriptor::new(schema.clone())); let created_by = file_metadata.created_by().map(str::to_string); - let row_groups = self - .metadata - .row_groups() - .iter() - .map(|rg| rg.to_thrift()) - .collect::>(); + let mut row_groups = self.metadata.row_groups.clone(); let key_value_metadata = file_metadata.key_value_metadata().cloned(); @@ -377,7 +391,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { &mut self.buf, &schema, &schema_descr, - row_groups, + &mut row_groups, created_by, file_metadata.version(), ); @@ -472,7 +486,7 @@ impl MetadataObjectWriter { fn write_offset_index( &self, offset_index: &OffsetIndexMetaData, - _column_chunk: &crate::format::ColumnChunk, + _column_chunk: &ColumnChunkMetaData, _row_group_idx: usize, _column_idx: usize, sink: impl Write, @@ -484,7 +498,7 @@ impl MetadataObjectWriter { fn write_column_index( &self, column_index: &ColumnIndexMetaData, - _column_chunk: &crate::format::ColumnChunk, + _column_chunk: &ColumnChunkMetaData, _row_group_idx: usize, _column_idx: usize, sink: impl Write, @@ -559,7 +573,7 @@ impl MetadataObjectWriter { fn write_offset_index( &self, offset_index: &OffsetIndexMetaData, - column_chunk: &crate::format::ColumnChunk, + column_chunk: &ColumnChunkMetaData, row_group_idx: usize, column_idx: usize, sink: impl Write, @@ -584,7 +598,7 @@ impl MetadataObjectWriter { fn write_column_index( &self, column_index: &ColumnIndexMetaData, - column_chunk: &crate::format::ColumnChunk, + column_chunk: &ColumnChunkMetaData, row_group_idx: usize, column_idx: usize, sink: impl Write, @@ -608,11 +622,8 @@ impl MetadataObjectWriter { /// and possibly unencrypted metadata to be returned to clients if data was encrypted. 
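    /// When a file encryptor is configured, the returned tuple holds the row groups
    /// as they will be serialized (with column metadata encrypted where required)
    /// plus a plaintext copy of the originals; otherwise the input row groups are
    /// passed through unchanged and the second element is `None`.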
fn apply_row_group_encryption( &self, - row_groups: Vec, - ) -> Result<( - Vec, - Option>, - )> { + row_groups: Vec, + ) -> Result<(Vec, Option>)> { match &self.file_encryptor { Some(file_encryptor) => { let unencrypted_row_groups = row_groups.clone(); @@ -636,21 +647,12 @@ impl MetadataObjectWriter { object: &impl WriteThrift, mut sink: impl Write, file_encryptor: &FileEncryptor, - column_metadata: &crate::format::ColumnChunk, + column_metadata: &ColumnChunkMetaData, module_type: ModuleType, row_group_index: usize, column_index: usize, ) -> Result<()> { - let column_path_vec = &column_metadata - .meta_data - .as_ref() - .ok_or_else(|| { - general_err!( - "Column metadata not set for column {} when encrypting object", - column_index - ) - })? - .path_in_schema; + let column_path_vec = column_metadata.column_path().as_ref(); let joined_column_path; let column_path = if column_path_vec.len() == 1 { @@ -699,36 +701,34 @@ impl MetadataObjectWriter { .aad_prefix() .map(|_| !file_encryptor.properties().store_aad_prefix()); let aad_prefix = if file_encryptor.properties().store_aad_prefix() { - file_encryptor.properties().aad_prefix().cloned() + file_encryptor.properties().aad_prefix() } else { None }; - EncryptionAlgorithm::AESGCMV1(AesGcmV1 { - aad_prefix, + EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 { + aad_prefix: aad_prefix.cloned(), aad_file_unique: Some(file_encryptor.aad_file_unique().clone()), supply_aad_prefix, }) } - fn file_crypto_metadata( - file_encryptor: &FileEncryptor, - ) -> Result { + fn file_crypto_metadata(file_encryptor: &FileEncryptor) -> Result { let properties = file_encryptor.properties(); - Ok(crate::format::FileCryptoMetaData { + Ok(FileCryptoMetaData { encryption_algorithm: Self::encryption_algorithm_from_encryptor(file_encryptor), key_metadata: properties.footer_key_metadata().cloned(), }) } fn encrypt_row_groups( - row_groups: Vec, + row_groups: Vec, file_encryptor: &Arc, - ) -> Result> { + ) -> Result> { row_groups .into_iter() .enumerate() .map(|(rg_idx, mut rg)| { - let cols: Result> = rg + let cols: Result> = rg .columns .into_iter() .enumerate() @@ -744,26 +744,27 @@ impl MetadataObjectWriter { /// Apply column encryption to column chunk metadata fn encrypt_column_chunk( - mut column_chunk: crate::format::ColumnChunk, + mut column_chunk: ColumnChunkMetaData, file_encryptor: &Arc, row_group_index: usize, column_index: usize, - ) -> Result { + ) -> Result { // Column crypto metadata should have already been set when the column was created. // Here we apply the encryption by encrypting the column metadata if required. - match column_chunk.crypto_metadata.as_ref() { + match column_chunk.column_crypto_metadata.as_ref() { None => {} - Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => { // When uniform encryption is used the footer is already encrypted, // so the column chunk does not need additional encryption. 
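                // (When the footer key is used, write_file_metadata encrypts the
                // entire footer, column metadata included.)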
} - Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(col_key)) => { + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(col_key)) => { + use crate::{ + encryption::encrypt::encrypt_thrift_object_to_vec, + file::metadata::thrift_gen::column_meta_data_from_chunk, + }; + let column_path = col_key.path_in_schema.join("."); let mut column_encryptor = file_encryptor.get_column_encryptor(&column_path)?; - let meta_data = column_chunk - .meta_data - .take() - .ok_or_else(|| general_err!("Column metadata not set for encryption"))?; let aad = create_module_aad( file_encryptor.file_aad(), ModuleType::ColumnMetaData, @@ -771,10 +772,11 @@ impl MetadataObjectWriter { column_index, None, )?; - let ciphertext = encrypt_object_to_vec(&meta_data, &mut column_encryptor, &aad)?; + // TODO: create temp ColumnMetaData that we can encrypt + let cc = column_meta_data_from_chunk(&column_chunk); + let ciphertext = encrypt_thrift_object_to_vec(&cc, &mut column_encryptor, &aad)?; column_chunk.encrypted_column_metadata = Some(ciphertext); - debug_assert!(column_chunk.meta_data.is_none()); } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 15b6c6be65e0..5dba7bf3ed49 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -2553,8 +2553,8 @@ mod tests { } let file_metadata = file_writer.close().unwrap(); - assert_eq!(file_metadata.num_rows, 25); - assert_eq!(file_metadata.row_groups.len(), 5); + assert_eq!(file_metadata.file_metadata().num_rows(), 25); + assert_eq!(file_metadata.num_row_groups(), 5); // read only the 3rd row group let read_options = ReadOptionsBuilder::new() diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 985903cad1b0..e960ade7ccf7 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -290,7 +290,7 @@ impl SerializedFileWriter { /// Unlike [`Self::close`] this does not consume self /// /// Attempting to write after calling finish will result in an error - pub fn finish(&mut self) -> Result { + pub fn finish(&mut self) -> Result { self.assert_previous_writer_closed()?; let metadata = self.write_metadata()?; self.buf.flush()?; @@ -298,7 +298,7 @@ impl SerializedFileWriter { } /// Closes and finalises file writer, returning the file metadata. - pub fn close(mut self) -> Result { + pub fn close(mut self) -> Result { self.finish() } @@ -319,7 +319,7 @@ impl SerializedFileWriter { } /// Assembles and writes metadata at the end of the file. - fn write_metadata(&mut self) -> Result { + fn write_metadata(&mut self) -> Result { self.finished = true; // write out any remaining bloom filters after all row groups @@ -333,17 +333,11 @@ impl SerializedFileWriter { None => Some(self.kv_metadatas.clone()), }; - let row_groups = self - .row_groups - .iter() - .map(|v| v.to_thrift()) - .collect::>(); - let mut encoder = ThriftMetadataWriter::new( &mut self.buf, &self.schema, &self.descr, - row_groups, + &mut self.row_groups, Some(self.props.created_by().to_string()), self.props.writer_version().as_num(), ); @@ -1612,7 +1606,7 @@ mod tests { file: W, data: Vec>, compression: Compression, - ) -> crate::format::FileMetaData + ) -> ParquetMetaData where W: Write + Send, R: ChunkReader + From + 'static, @@ -1627,7 +1621,7 @@ mod tests { data: Vec>, value: F, compression: Compression, - ) -> crate::format::FileMetaData + ) -> ParquetMetaData where W: Write + Send, R: ChunkReader + From + 'static, @@ -1698,7 +1692,7 @@ mod tests { /// File write-read roundtrip. 
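    /// Returns the resulting file metadata so tests can make assertions on it.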
/// `data` consists of arrays of values for each row group. - fn test_file_roundtrip(file: File, data: Vec>) -> crate::format::FileMetaData { + fn test_file_roundtrip(file: File, data: Vec>) -> ParquetMetaData { test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED) } @@ -1773,13 +1767,12 @@ mod tests { fn test_column_offset_index_file() { let file = tempfile::tempfile().unwrap(); let file_metadata = test_file_roundtrip(file, vec![vec![1, 2, 3, 4, 5]]); - file_metadata.row_groups.iter().for_each(|row_group| { - row_group.columns.iter().for_each(|column_chunk| { - assert_ne!(None, column_chunk.column_index_offset); - assert_ne!(None, column_chunk.column_index_length); - - assert_ne!(None, column_chunk.offset_index_offset); - assert_ne!(None, column_chunk.offset_index_length); + file_metadata.row_groups().iter().for_each(|row_group| { + row_group.columns().iter().for_each(|column_chunk| { + assert!(column_chunk.column_index_offset().is_some()); + assert!(column_chunk.column_index_length().is_some()); + assert!(column_chunk.offset_index_offset().is_some()); + assert!(column_chunk.offset_index_length().is_some()); }) }); } @@ -2020,15 +2013,15 @@ mod tests { row_group_writer.close().unwrap(); let metadata = file_writer.finish().unwrap(); - assert_eq!(metadata.row_groups.len(), 1); - let row_group = &metadata.row_groups[0]; - assert_eq!(row_group.columns.len(), 2); + assert_eq!(metadata.num_row_groups(), 1); + let row_group = metadata.row_group(0); + assert_eq!(row_group.num_columns(), 2); // Column "a" has both offset and column index, as requested - assert!(row_group.columns[0].offset_index_offset.is_some()); - assert!(row_group.columns[0].column_index_offset.is_some()); + assert!(row_group.column(0).offset_index_offset().is_some()); + assert!(row_group.column(0).column_index_offset().is_some()); // Column "b" should only have offset index - assert!(row_group.columns[1].offset_index_offset.is_some()); - assert!(row_group.columns[1].column_index_offset.is_none()); + assert!(row_group.column(1).offset_index_offset().is_some()); + assert!(row_group.column(1).column_index_offset().is_none()); let err = file_writer.next_row_group().err().unwrap().to_string(); assert_eq!(err, "Parquet error: SerializedFileWriter already finished"); @@ -2082,9 +2075,8 @@ mod tests { row_group_writer.close().unwrap(); let file_metadata = writer.close().unwrap(); - assert_eq!(file_metadata.row_groups.len(), 1); - assert_eq!(file_metadata.row_groups[0].columns.len(), 1); - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + assert_eq!(file_metadata.num_row_groups(), 1); + assert_eq!(file_metadata.row_group(0).num_columns(), 1); let check_def_hist = |def_hist: &[i64]| { assert_eq!(def_hist.len(), 2); @@ -2092,29 +2084,23 @@ mod tests { assert_eq!(def_hist[1], 7); }; - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - let meta_data = file_metadata.row_groups[0].columns[0] - .meta_data - .as_ref() - .unwrap(); - assert!(meta_data.size_statistics.is_some()); - let size_stats = meta_data.size_statistics.as_ref().unwrap(); + let meta_data = file_metadata.row_group(0).column(0); - assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_some()); - assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); + assert!(meta_data.repetition_level_histogram().is_none()); + assert!(meta_data.definition_level_histogram().is_some()); + assert!(meta_data.unencoded_byte_array_data_bytes().is_some()); assert_eq!( unenc_size, - 
size_stats.unencoded_byte_array_data_bytes.unwrap() + meta_data.unencoded_byte_array_data_bytes().unwrap() ); - check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); + check_def_hist(&meta_data.definition_level_histogram().unwrap().values()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rfile_metadata = reader.metadata().file_metadata(); - assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(rfile_metadata.num_rows(), file_metadata.file_metadata().num_rows()); assert_eq!(reader.num_row_groups(), 1); let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); @@ -2234,9 +2220,8 @@ mod tests { row_group_writer.close().unwrap(); let file_metadata = writer.close().unwrap(); - assert_eq!(file_metadata.row_groups.len(), 1); - assert_eq!(file_metadata.row_groups[0].columns.len(), 1); - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + assert_eq!(file_metadata.num_row_groups(), 1); + assert_eq!(file_metadata.row_group(0).num_columns(), 1); let check_def_hist = |def_hist: &[i64]| { assert_eq!(def_hist.len(), 4); @@ -2254,25 +2239,19 @@ mod tests { // check that histograms are set properly in the write and read metadata // also check that unencoded_byte_array_data_bytes is not set - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - let meta_data = file_metadata.row_groups[0].columns[0] - .meta_data - .as_ref() - .unwrap(); - assert!(meta_data.size_statistics.is_some()); - let size_stats = meta_data.size_statistics.as_ref().unwrap(); - assert!(size_stats.repetition_level_histogram.is_some()); - assert!(size_stats.definition_level_histogram.is_some()); - assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); - check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); - check_rep_hist(size_stats.repetition_level_histogram.as_ref().unwrap()); + let meta_data = file_metadata.row_group(0).column(0); + assert!(meta_data.repetition_level_histogram().is_some()); + assert!(meta_data.definition_level_histogram().is_some()); + assert!(meta_data.unencoded_byte_array_data_bytes().is_none()); + check_def_hist(meta_data.definition_level_histogram().unwrap().values()); + check_rep_hist(meta_data.repetition_level_histogram().unwrap().values()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rfile_metadata = reader.metadata().file_metadata(); - assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(rfile_metadata.num_rows(), file_metadata.file_metadata().num_rows()); assert_eq!(reader.num_row_groups(), 1); let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); From ee810e1781b90a145488966b96d56fb8f432e743 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Sep 2025 18:22:18 -0700 Subject: [PATCH 066/126] checkpoint encrypt column meta --- parquet/src/arrow/arrow_writer/mod.rs | 12 +++- parquet/src/column/writer/mod.rs | 4 +- parquet/src/file/metadata/thrift_gen.rs | 26 -------- parquet/src/file/metadata/writer.rs | 89 ++++++++++++++++++++++--- parquet/src/file/writer.rs | 10 ++- 5 files changed, 100 insertions(+), 41 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 
56f820a0d94e..e91a67766496 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -4322,8 +4322,16 @@ mod tests { assert_eq!(file_metadata.num_row_groups(), 1); assert_eq!(file_metadata.row_group(0).num_columns(), 1); - assert!(file_metadata.row_group(0).column(0).page_encoding_stats().is_some()); - let chunk_page_stats = file_metadata.row_group(0).column(0).page_encoding_stats().unwrap(); + assert!(file_metadata + .row_group(0) + .column(0) + .page_encoding_stats() + .is_some()); + let chunk_page_stats = file_metadata + .row_group(0) + .column(0) + .page_encoding_stats() + .unwrap(); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index be9cd951f54c..521510b0f21c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3587,9 +3587,7 @@ mod tests { col_writer.close().unwrap(); row_group_writer.close().unwrap(); let file_metadata = writer.close().unwrap(); - let stats = file_metadata.row_group(0).column(0) - .statistics() - .unwrap(); + let stats = file_metadata.row_group(0).column(0).statistics().unwrap(); assert!(!stats.max_is_exact()); // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should // be incremented by 1. diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 931bfbaca3d5..618841cc60df 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -205,32 +205,6 @@ struct ColumnMetaData<'a> { } ); -pub(crate) fn column_meta_data_from_chunk<'a>( - column_chunk: &'a ColumnChunkMetaData, -) -> ColumnMetaData<'a> { - ColumnMetaData { - type_: column_chunk.column_type(), - encodings: column_chunk.encodings.clone(), - codec: column_chunk.compression, - num_values: column_chunk.num_values, - total_uncompressed_size: column_chunk.total_uncompressed_size, - total_compressed_size: column_chunk.total_compressed_size, - data_page_offset: column_chunk.data_page_offset, - index_page_offset: column_chunk.index_page_offset, - dictionary_page_offset: column_chunk.dictionary_page_offset, - statistics: column_chunk.statistics(), - encoding_stats: column_chunk.encoding_stats.clone(), - bloom_filter_offset: column_chunk.bloom_filter_offset, - bloom_filter_length: column_chunk.bloom_filter_length, - size_statistics: Some(SizeStatistics { - unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes, - repetition_level_histogram: column_chunk.repetition_level_histogram, - definition_level_histogram: column_chunk.definition_level_histogram, - }), - geospatial_statistics: None, - } -} - thrift_struct!( struct BoundingBox { 1: required double xmin; diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index d53cd6c8a576..32f49fff3bb7 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -758,11 +758,6 @@ impl MetadataObjectWriter { // so the column chunk does not need additional encryption. 
} Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(col_key)) => { - use crate::{ - encryption::encrypt::encrypt_thrift_object_to_vec, - file::metadata::thrift_gen::column_meta_data_from_chunk, - }; - let column_path = col_key.path_in_schema.join("."); let mut column_encryptor = file_encryptor.get_column_encryptor(&column_path)?; let aad = create_module_aad( @@ -772,14 +767,92 @@ impl MetadataObjectWriter { column_index, None, )?; - // TODO: create temp ColumnMetaData that we can encrypt - let cc = column_meta_data_from_chunk(&column_chunk); - let ciphertext = encrypt_thrift_object_to_vec(&cc, &mut column_encryptor, &aad)?; + // create temp ColumnMetaData that we can encrypt + let cc = Self::serialize_column_meta_data(&column_chunk)?; + let ciphertext = column_encryptor.encrypt(&cc, &aad)?; + // TODO: remember to not serialize column meta data if encrypted_column_metadata + // is Some column_chunk.encrypted_column_metadata = Some(ciphertext); } } Ok(column_chunk) } + + // serialize the bits of the column chunk needed for a thrift ColumnMetaData + // struct ColumnMetaData { + // 1: required Type type + // 2: required list encodings + // 3: required list path_in_schema + // 4: required CompressionCodec codec + // 5: required i64 num_values + // 6: required i64 total_uncompressed_size + // 7: required i64 total_compressed_size + // 8: optional list key_value_metadata + // 9: required i64 data_page_offset + // 10: optional i64 index_page_offset + // 11: optional i64 dictionary_page_offset + // 12: optional Statistics statistics; + // 13: optional list encoding_stats; + // 14: optional i64 bloom_filter_offset; + // 15: optional i32 bloom_filter_length; + // 16: optional SizeStatistics size_statistics; + // 17: optional GeospatialStatistics geospatial_statistics; + // } + fn serialize_column_meta_data(column_chunk: &ColumnChunkMetaData) -> Result> { + use crate::{file::statistics::page_stats_to_thrift, parquet_thrift::WriteThriftField}; + + let mut buf = Vec::new(); + let mut w = ThriftCompactOutputProtocol::new(&mut buf); + + column_chunk + .column_type() + .write_thrift_field(&mut w, 1, 0)?; + column_chunk.encodings.write_thrift_field(&mut w, 2, 1)?; + let path = column_chunk.column_descr.path().parts(); + let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect(); + path.write_thrift_field(&mut w, 3, 2)?; + column_chunk.compression.write_thrift_field(&mut w, 4, 3)?; + column_chunk.num_values.write_thrift_field(&mut w, 5, 4)?; + column_chunk + .total_uncompressed_size + .write_thrift_field(&mut w, 6, 5)?; + column_chunk + .total_compressed_size + .write_thrift_field(&mut w, 7, 6)?; + // no key_value_metadata here + let mut last_field_id = column_chunk + .data_page_offset + .write_thrift_field(&mut w, 9, 7)?; + if let Some(index_page_offset) = column_chunk.index_page_offset { + last_field_id = index_page_offset.write_thrift_field(&mut w, 10, last_field_id)?; + } + if let Some(dictionary_page_offset) = column_chunk.dictionary_page_offset { + last_field_id = dictionary_page_offset.write_thrift_field(&mut w, 11, last_field_id)?; + } + // PageStatistics is the same as thrift Statistics, but writable + let stats = page_stats_to_thrift(column_chunk.statistics()); + if let Some(stats) = stats { + last_field_id = stats.write_thrift_field(&mut w, 12, last_field_id)?; + } + if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() { + last_field_id = page_encoding_stats.write_thrift_field(&mut w, 13, last_field_id)?; + } + if let Some(bloom_filter_offset) = 
column_chunk.bloom_filter_offset { + last_field_id = bloom_filter_offset.write_thrift_field(&mut w, 14, last_field_id)?; + } + if let Some(bloom_filter_length) = column_chunk.bloom_filter_length { + last_field_id = bloom_filter_length.write_thrift_field(&mut w, 15, last_field_id)?; + } + if let Some(index_page_offset) = column_chunk.index_page_offset { + // uncomment when we add geo spatial + //last_field_id = index_page_offset.write_thrift_field(&mut w, 16, last_field_id)?; + index_page_offset.write_thrift_field(&mut w, 16, last_field_id)?; + } + // TODO: geo spatial here + w.write_struct_end()?; + + Ok(buf) + } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index e960ade7ccf7..88c1cebf0351 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -2100,7 +2100,10 @@ mod tests { let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rfile_metadata = reader.metadata().file_metadata(); - assert_eq!(rfile_metadata.num_rows(), file_metadata.file_metadata().num_rows()); + assert_eq!( + rfile_metadata.num_rows(), + file_metadata.file_metadata().num_rows() + ); assert_eq!(reader.num_row_groups(), 1); let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); @@ -2251,7 +2254,10 @@ mod tests { let reader = SerializedFileReader::new_with_options(file, options).unwrap(); let rfile_metadata = reader.metadata().file_metadata(); - assert_eq!(rfile_metadata.num_rows(), file_metadata.file_metadata().num_rows()); + assert_eq!( + rfile_metadata.num_rows(), + file_metadata.file_metadata().num_rows() + ); assert_eq!(reader.num_row_groups(), 1); let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); From 3afcfac737d58f246c35f968d69b392c704255ce Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 7 Sep 2025 15:22:42 -0700 Subject: [PATCH 067/126] checkpoint...write code finished --- parquet/src/encryption/encrypt.rs | 23 ++ parquet/src/file/metadata/thrift_gen.rs | 329 +++++++++++++++++++++++- parquet/src/file/metadata/writer.rs | 202 ++++++--------- parquet/src/file/writer.rs | 2 +- parquet/src/schema/types.rs | 21 ++ 5 files changed, 448 insertions(+), 129 deletions(-) diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 97893021699e..fe9580551af3 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -412,6 +412,29 @@ pub(crate) fn write_signed_plaintext_object( Ok(()) } +pub(crate) fn write_signed_plaintext_thrift_object( + object: &T, + encryptor: &mut Box, + sink: &mut W, + module_aad: &[u8], +) -> Result<()> { + let mut buffer: Vec = vec![]; + { + let mut protocol = ThriftCompactOutputProtocol::new(&mut buffer); + object.write_thrift(&mut protocol)?; + } + sink.write_all(&buffer)?; + buffer = encryptor.encrypt(buffer.as_ref(), module_aad)?; + + // Format of encrypted buffer is: [ciphertext size, nonce, ciphertext, authentication tag] + let nonce = &buffer[SIZE_LEN..SIZE_LEN + NONCE_LEN]; + let tag = &buffer[buffer.len() - TAG_LEN..]; + sink.write_all(nonce)?; + sink.write_all(tag)?; + + Ok(()) +} + /// Encrypt a Thrift serializable object to a byte vector pub(crate) fn encrypt_object_to_vec( object: &T, diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 618841cc60df..75673d2fca62 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -38,7 +38,9 @@ use crate::{ read_thrift_vec, ElementType, FieldType, 
ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift,
         WriteThriftField,
     },
-    schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor},
+    schema::types::{
+        num_nodes, parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor, TypePtr,
+    },
     thrift_struct, thrift_union,
     util::bit_util::FromBytes,
 };
@@ -1154,6 +1156,331 @@ impl PageHeader {
     }
 }

+/////////////////////////////////////////////////
+// helper functions for writing file meta data
+
+// serialize the bits of the column chunk needed for a thrift ColumnMetaData
+// struct ColumnMetaData {
+//   1: required Type type
+//   2: required list<Encoding> encodings
+//   3: required list<string> path_in_schema
+//   4: required CompressionCodec codec
+//   5: required i64 num_values
+//   6: required i64 total_uncompressed_size
+//   7: required i64 total_compressed_size
+//   8: optional list<KeyValue> key_value_metadata
+//   9: required i64 data_page_offset
+//   10: optional i64 index_page_offset
+//   11: optional i64 dictionary_page_offset
+//   12: optional Statistics statistics;
+//   13: optional list<PageEncodingStats> encoding_stats;
+//   14: optional i64 bloom_filter_offset;
+//   15: optional i32 bloom_filter_length;
+//   16: optional SizeStatistics size_statistics;
+//   17: optional GeospatialStatistics geospatial_statistics;
+// }
+pub(crate) fn serialize_column_meta_data<W: Write>(
+    column_chunk: &ColumnChunkMetaData,
+    w: &mut ThriftCompactOutputProtocol<W>,
+) -> Result<()> {
+    use crate::file::statistics::page_stats_to_thrift;
+
+    column_chunk.column_type().write_thrift_field(w, 1, 0)?;
+    column_chunk.encodings.write_thrift_field(w, 2, 1)?;
+    let path = column_chunk.column_descr.path().parts();
+    let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect();
+    path.write_thrift_field(w, 3, 2)?;
+    column_chunk.compression.write_thrift_field(w, 4, 3)?;
+    column_chunk.num_values.write_thrift_field(w, 5, 4)?;
+    column_chunk
+        .total_uncompressed_size
+        .write_thrift_field(w, 6, 5)?;
+    column_chunk
+        .total_compressed_size
+        .write_thrift_field(w, 7, 6)?;
+    // no key_value_metadata here
+    let mut last_field_id = column_chunk.data_page_offset.write_thrift_field(w, 9, 7)?;
+    if let Some(index_page_offset) = column_chunk.index_page_offset {
+        last_field_id = index_page_offset.write_thrift_field(w, 10, last_field_id)?;
+    }
+    if let Some(dictionary_page_offset) = column_chunk.dictionary_page_offset {
+        last_field_id = dictionary_page_offset.write_thrift_field(w, 11, last_field_id)?;
+    }
+    // PageStatistics is the same as thrift Statistics, but writable
+    let stats = page_stats_to_thrift(column_chunk.statistics());
+    if let Some(stats) = stats {
+        last_field_id = stats.write_thrift_field(w, 12, last_field_id)?;
+    }
+    if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() {
+        last_field_id = page_encoding_stats.write_thrift_field(w, 13, last_field_id)?;
+    }
+    if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset {
+        last_field_id = bloom_filter_offset.write_thrift_field(w, 14, last_field_id)?;
+    }
+    if let Some(bloom_filter_length) = column_chunk.bloom_filter_length {
+        last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?;
+    }
+    // TODO: field 16 size statistics here; this optional field is skipped for now,
+    // since field id 16 belongs to SizeStatistics and must not carry a bare i64
+    // TODO: field 17 geo spatial stats here
+    let _ = last_field_id; // silence unused_assignments until fields 16 and 17 are written
+    w.write_struct_end()
+}
+
+// temp struct used for writing
+pub(crate) struct FileMeta<'a> {
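+    // Assembled in ThriftMetadataWriter::finish(), e.g.
+    // FileMeta { file_metadata: &file_metadata, row_groups: &row_groups, .. },
+    // so a thrift FileMetaData can be written without cloning the row groups again.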
+ pub(crate) file_metadata: &'a crate::file::metadata::FileMetaData, + pub(crate) row_groups: &'a Vec, + pub(crate) encryption_algorithm: Option, + pub(crate) footer_signing_key_metadata: Option>, +} + +impl<'a> WriteThrift for FileMeta<'a> { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + self.file_metadata + .version + .write_thrift_field(writer, 1, 0)?; + + // field 2 is schema. do depth-first traversal of tree, converting to SchemaElement and + // writing along the way. + let root = self.file_metadata.schema_descr().root_schema_ptr(); + let schema_len = num_nodes(&root); + writer.write_field_begin(FieldType::List, 2, 1)?; + writer.write_list_begin(ElementType::Struct, schema_len)?; + // recursively write Type nodes as SchemaElements + write_schema(&root, writer)?; + + self.file_metadata + .num_rows + .write_thrift_field(writer, 3, 2)?; + + // this will call RowGroupMetaData::write_thrift + let mut last_field_id = self.row_groups.write_thrift_field(writer, 4, 3)?; + + if let Some(kv_metadata) = self.file_metadata.key_value_metadata() { + last_field_id = kv_metadata.write_thrift_field(writer, 5, last_field_id)?; + } + if let Some(created_by) = self.file_metadata.created_by() { + last_field_id = created_by.write_thrift_field(writer, 6, last_field_id)?; + } + if let Some(column_orders) = self.file_metadata.column_orders() { + last_field_id = column_orders.write_thrift_field(writer, 7, last_field_id)?; + } + if let Some(algo) = self.encryption_algorithm.as_ref() { + last_field_id = algo.write_thrift_field(writer, 8, last_field_id)?; + } + if let Some(key) = self.footer_signing_key_metadata.as_ref() { + key.as_slice() + .write_thrift_field(writer, 9, last_field_id)?; + } + + writer.write_struct_end() + } +} + +fn write_schema( + node: &TypePtr, + writer: &mut ThriftCompactOutputProtocol, +) -> Result<()> { + match node.as_ref() { + crate::schema::types::Type::PrimitiveType { + basic_info, + physical_type, + type_length, + scale, + precision, + } => { + let element = SchemaElement { + type_: Some(*physical_type), + type_length: if *type_length >= 0 { + Some(*type_length) + } else { + None + }, + repetition_type: Some(basic_info.repetition()), + name: basic_info.name(), + num_children: None, + converted_type: basic_info.converted_type().into(), + scale: if *scale >= 0 { Some(*scale) } else { None }, + precision: if *precision >= 0 { + Some(*precision) + } else { + None + }, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: basic_info.logical_type(), + }; + element.write_thrift(writer) + } + crate::schema::types::Type::GroupType { basic_info, fields } => { + let repetition = if basic_info.has_repetition() { + Some(basic_info.repetition()) + } else { + None + }; + + let element = SchemaElement { + type_: None, + type_length: None, + repetition_type: repetition, + name: basic_info.name(), + num_children: Some(fields.len() as i32), + converted_type: basic_info.converted_type().into(), + scale: None, + precision: None, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: basic_info.logical_type(), + }; + + element.write_thrift(writer)?; + + // Add child elements for a group + for field in fields { + write_schema(field, writer)?; + } + Ok(()) + } + } +} + +// struct RowGroup { +// 1: required list columns +// 2: required i64 total_byte_size +// 3: required i64 num_rows +// 4: 
optional list sorting_columns +// 5: optional i64 file_offset +// 6: optional i64 total_compressed_size +// 7: optional i16 ordinal +// } +impl WriteThrift for RowGroupMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + // this will call ColumnChunkMetaData::write_thrift + self.columns.write_thrift_field(writer, 1, 0)?; + self.total_byte_size.write_thrift_field(writer, 2, 1)?; + let mut last_field_id = self.num_rows.write_thrift_field(writer, 3, 2)?; + if let Some(sorting_columns) = self.sorting_columns() { + last_field_id = sorting_columns.write_thrift_field(writer, 4, last_field_id)?; + } + if let Some(file_offset) = self.file_offset() { + last_field_id = file_offset.write_thrift_field(writer, 5, last_field_id)?; + } + // this is optional, but we'll always write it + last_field_id = self + .compressed_size() + .write_thrift_field(writer, 6, last_field_id)?; + if let Some(ordinal) = self.ordinal() { + ordinal.write_thrift_field(writer, 7, last_field_id)?; + } + writer.write_struct_end() + } +} + +// struct ColumnChunk { +// 1: optional string file_path +// 2: required i64 file_offset = 0 +// 3: optional ColumnMetaData meta_data +// 4: optional i64 offset_index_offset +// 5: optional i32 offset_index_length +// 6: optional i64 column_index_offset +// 7: optional i32 column_index_length +// 8: optional ColumnCryptoMetaData crypto_metadata +// 9: optional binary encrypted_column_metadata +// } +#[cfg(feature = "encryption")] +impl WriteThrift for ColumnChunkMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if let Some(file_path) = self.file_path() { + last_field_id = file_path.write_thrift_field(writer, 1, last_field_id)?; + } + last_field_id = self + .file_offset() + .write_thrift_field(writer, 2, last_field_id)?; + + // only write the ColumnMetaData if we haven't already encrypted it + if self.encrypted_column_metadata.is_none() { + writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; + serialize_column_meta_data(&self, writer)?; + last_field_id = 3; + } + + if let Some(offset_idx_off) = self.offset_index_offset() { + last_field_id = offset_idx_off.write_thrift_field(writer, 4, last_field_id)?; + } + if let Some(offset_idx_len) = self.offset_index_length() { + last_field_id = offset_idx_len.write_thrift_field(writer, 5, last_field_id)?; + } + if let Some(column_idx_off) = self.column_index_offset() { + last_field_id = column_idx_off.write_thrift_field(writer, 6, last_field_id)?; + } + if let Some(column_idx_len) = self.column_index_length() { + last_field_id = column_idx_len.write_thrift_field(writer, 7, last_field_id)?; + } + if let Some(crypto_metadata) = self.crypto_metadata() { + last_field_id = crypto_metadata.write_thrift_field(writer, 8, last_field_id)?; + } + if let Some(encrypted_meta) = self.encrypted_column_metadata.as_ref() { + encrypted_meta + .as_slice() + .write_thrift_field(writer, 9, last_field_id)?; + } + + writer.write_struct_end() + } +} + +#[cfg(not(feature = "encryption"))] +impl WriteThrift for ColumnChunkMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if let Some(file_path) = self.file_path() { + last_field_id = file_path.write_thrift_field(writer, 1, last_field_id)?; + } + 
last_field_id = self + .file_offset() + .write_thrift_field(writer, 2, last_field_id)?; + + // always write the ColumnMetaData + writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; + serialize_column_meta_data(&self, writer)?; + last_field_id = 3; + + if let Some(offset_idx_off) = self.offset_index_offset() { + last_field_id = offset_idx_off.write_thrift_field(writer, 4, last_field_id)?; + } + if let Some(offset_idx_len) = self.offset_index_length() { + last_field_id = offset_idx_len.write_thrift_field(writer, 5, last_field_id)?; + } + if let Some(column_idx_off) = self.column_index_offset() { + last_field_id = column_idx_off.write_thrift_field(writer, 6, last_field_id)?; + } + if let Some(column_idx_len) = self.column_index_length() { + column_idx_len.write_thrift_field(writer, 7, last_field_id)?; + } + + writer.write_struct_end() + } +} + #[cfg(test)] mod tests { use crate::file::metadata::thrift_gen::BoundingBox; diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 32f49fff3bb7..e32051c549ac 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -15,28 +15,27 @@ // specific language governing permissions and limitations // under the License. -use crate::file::metadata::thrift_gen::EncryptionAlgorithm; -use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; +use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileMeta}; +use crate::file::metadata::{ + ColumnChunkMetaData, ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData, +}; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; use crate::{ basic::ColumnOrder, file::metadata::{FileMetaData, ParquetMetaDataBuilder}, - schema::types, }; #[cfg(feature = "encryption")] use crate::{ encryption::{ - encrypt::{encrypt_object, write_signed_plaintext_object, FileEncryptor}, + encrypt::{encrypt_thrift_object, write_signed_plaintext_thrift_object, FileEncryptor}, modules::{create_footer_aad, create_module_aad, ModuleType}, }, - file::metadata::thrift_gen::FileCryptoMetaData, + file::column_crypto_metadata::ColumnCryptoMetaData, + file::metadata::thrift_gen::{AesGcmV1, FileCryptoMetaData}, }; use crate::{errors::Result, file::page_index::column_index::ColumnIndexMetaData}; -#[cfg(feature = "encryption")] -use crate::{ - file::column_crypto_metadata::ColumnCryptoMetaData, file::metadata::thrift_gen::AesGcmV1, -}; + use crate::{ file::writer::{get_file_magic, TrackedWrite}, parquet_thrift::WriteThrift, @@ -178,7 +177,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { let (encryption_algorithm, footer_signing_key_metadata) = self.object_writer.get_plaintext_footer_crypto_metadata(); - let mut file_metadata = FileMetaData::new( + let file_metadata = FileMetaData::new( self.writer_version, num_rows, self.created_by, @@ -187,12 +186,17 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { column_orders, ); - let schema = types::to_thrift(self.schema.as_ref())?; + let file_meta = FileMeta { + file_metadata: &file_metadata, + row_groups: &row_groups, + encryption_algorithm, + footer_signing_key_metadata, + }; // Write file metadata let start_pos = self.buf.bytes_written(); self.object_writer - .write_file_metadata(&file_metadata, &mut self.buf)?; + .write_file_metadata(&file_meta, &mut self.buf)?; let end_pos = self.buf.bytes_written(); // Write footer @@ -201,19 +205,44 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { self.buf.write_all(&metadata_len.to_le_bytes())?; 
self.buf.write_all(self.object_writer.get_file_magic())?; - if let Some(row_groups) = unencrypted_row_groups { - // If row group metadata was encrypted, we replace the encrypted row groups with - // unencrypted metadata before it is returned to users. This allows the metadata - // to be usable for retrieving the row group statistics for example, without users - // needing to decrypt the metadata. - file_metadata.row_groups = row_groups; - } + // If row group metadata was encrypted, we replace the encrypted row groups with + // unencrypted metadata before it is returned to users. This allows the metadata + // to be usable for retrieving the row group statistics for example, without users + // needing to decrypt the metadata. + let mut builder = ParquetMetaDataBuilder::new(file_metadata); - let builder = ParquetMetaDataBuilder::new(file_metadata); - builder.set_row_groups(self.row_groups.clone()); + builder = match unencrypted_row_groups { + Some(rg) => builder.set_row_groups(rg), + None => builder.set_row_groups(row_groups), + }; - builder.set_column_index(self.column_indexes.map(|ci| ci.to_vec())); - builder.set_offset_index(self.offset_indexes.map(|oi| oi.to_vec())); + let column_indexes: Option = self.column_indexes.map(|ovvi| { + ovvi.iter() + .map(|vi| { + vi.iter() + .map(|oi| { + oi.as_ref() + .map(|i| i.clone()) + .unwrap_or(ColumnIndexMetaData::NONE) + }) + .collect() + }) + .collect() + }); + + // FIXME(ets): this will panic if there's a missing index. + let offset_indexes: Option = self.offset_indexes.map(|ovvi| { + ovvi.iter() + .map(|vi| { + vi.iter() + .map(|oi| oi.as_ref().map(|i| i.clone()).unwrap()) + .collect() + }) + .collect() + }); + + builder = builder.set_column_index(column_indexes); + builder = builder.set_offset_index(offset_indexes); Ok(builder.build()) } @@ -222,7 +251,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { buf: &'a mut TrackedWrite, schema: &'a TypePtr, schema_descr: &'a SchemaDescPtr, - row_groups: &'a mut Vec, + row_groups: Vec, created_by: Option, writer_version: i32, ) -> Self { @@ -380,7 +409,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { let schema_descr = Arc::new(SchemaDescriptor::new(schema.clone())); let created_by = file_metadata.created_by().map(str::to_string); - let mut row_groups = self.metadata.row_groups.clone(); + let row_groups = self.metadata.row_groups.clone(); let key_value_metadata = file_metadata.key_value_metadata().cloned(); @@ -391,7 +420,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { &mut self.buf, &schema, &schema_descr, - &mut row_groups, + row_groups, created_by, file_metadata.version(), ); @@ -406,6 +435,8 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { } fn convert_column_indexes(&self) -> Vec>> { + // FIXME(ets): we're converting from ParquetColumnIndex to vec>, + // but then converting back to ParquetColumnIndex in the end. need to unify this. 
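+        // Until that's done, a None entry just means no column index was written
+        // for that column chunk.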
if let Some(row_group_column_indexes) = self.metadata.column_index() { (0..self.metadata.row_groups().len()) .map(|rg_idx| { @@ -474,12 +505,8 @@ impl MetadataObjectWriter { #[cfg(not(feature = "encryption"))] impl MetadataObjectWriter { /// Write [`FileMetaData`] in Thrift format - fn write_file_metadata( - &self, - file_metadata: &crate::format::FileMetaData, - sink: impl Write, - ) -> Result<()> { - Self::write_object(file_metadata, sink) + fn write_file_metadata(&self, file_metadata: &FileMeta, sink: impl Write) -> Result<()> { + Self::write_thrift_object(file_metadata, sink) } /// Write a column [`OffsetIndex`] in Thrift format @@ -509,11 +536,8 @@ impl MetadataObjectWriter { /// No-op implementation of row-group metadata encryption fn apply_row_group_encryption( &self, - row_groups: Vec, - ) -> Result<( - Vec, - Option>, - )> { + row_groups: Vec, + ) -> Result<(Vec, Option>)> { Ok((row_groups, None)) } @@ -541,29 +565,25 @@ impl MetadataObjectWriter { /// Write [`FileMetaData`] in Thrift format, possibly encrypting it if required /// /// [`FileMetaData`]: crate::format::FileMetaData - fn write_file_metadata( - &self, - file_metadata: &crate::format::FileMetaData, - mut sink: impl Write, - ) -> Result<()> { + fn write_file_metadata(&self, file_metadata: &FileMeta, mut sink: impl Write) -> Result<()> { match self.file_encryptor.as_ref() { Some(file_encryptor) if file_encryptor.properties().encrypt_footer() => { // First write FileCryptoMetadata let crypto_metadata = Self::file_crypto_metadata(file_encryptor)?; - let mut protocol = TCompactOutputProtocol::new(&mut sink); - crypto_metadata.write_to_out_protocol(&mut protocol)?; + let mut protocol = ThriftCompactOutputProtocol::new(&mut sink); + crypto_metadata.write_thrift(&mut protocol)?; // Then write encrypted footer let aad = create_footer_aad(file_encryptor.file_aad())?; let mut encryptor = file_encryptor.get_footer_encryptor()?; - encrypt_object(file_metadata, &mut encryptor, &mut sink, &aad) + encrypt_thrift_object(file_metadata, &mut encryptor, &mut sink, &aad) } Some(file_encryptor) if file_metadata.encryption_algorithm.is_some() => { let aad = create_footer_aad(file_encryptor.file_aad())?; let mut encryptor = file_encryptor.get_footer_encryptor()?; - write_signed_plaintext_object(file_metadata, &mut encryptor, &mut sink, &aad) + write_signed_plaintext_thrift_object(file_metadata, &mut encryptor, &mut sink, &aad) } - _ => Self::write_object(file_metadata, &mut sink), + _ => Self::write_thrift_object(file_metadata, &mut sink), } } @@ -663,8 +683,6 @@ impl MetadataObjectWriter { }; if file_encryptor.is_column_encrypted(column_path) { - use crate::encryption::encrypt::encrypt_thrift_object; - let aad = create_module_aad( file_encryptor.file_aad(), module_type, @@ -758,6 +776,8 @@ impl MetadataObjectWriter { // so the column chunk does not need additional encryption. 
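// The write_file_metadata match earlier in this file distinguishes three ways
// a footer can be protected. A compact sketch of that decision, assuming a
// file encryptor is configured (`FooterMode` and this helper are illustrative,
// not the crate's API):
enum FooterMode {
    /// footer is fully encrypted; the file ends with the "PARE" magic
    EncryptedFooter,
    /// footer stays plaintext but a nonce and authentication tag are appended
    SignedPlaintext,
    /// ordinary unencrypted footer ending with the "PAR1" magic
    Plaintext,
}

fn footer_mode(encrypt_footer: bool, has_encryption_algorithm: bool) -> FooterMode {
    match (encrypt_footer, has_encryption_algorithm) {
        (true, _) => FooterMode::EncryptedFooter,
        (false, true) => FooterMode::SignedPlaintext,
        (false, false) => FooterMode::Plaintext,
    }
}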
} Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(col_key)) => { + use crate::file::metadata::thrift_gen::serialize_column_meta_data; + let column_path = col_key.path_in_schema.join("."); let mut column_encryptor = file_encryptor.get_column_encryptor(&column_path)?; let aad = create_module_aad( @@ -768,8 +788,12 @@ impl MetadataObjectWriter { None, )?; // create temp ColumnMetaData that we can encrypt - let cc = Self::serialize_column_meta_data(&column_chunk)?; - let ciphertext = column_encryptor.encrypt(&cc, &aad)?; + let mut buffer: Vec = vec![]; + { + let mut prot = ThriftCompactOutputProtocol::new(&mut buffer); + serialize_column_meta_data(&column_chunk, &mut prot)?; + } + let ciphertext = column_encryptor.encrypt(&buffer, &aad)?; // TODO: remember to not serialize column meta data if encrypted_column_metadata // is Some @@ -779,80 +803,4 @@ impl MetadataObjectWriter { Ok(column_chunk) } - - // serialize the bits of the column chunk needed for a thrift ColumnMetaData - // struct ColumnMetaData { - // 1: required Type type - // 2: required list encodings - // 3: required list path_in_schema - // 4: required CompressionCodec codec - // 5: required i64 num_values - // 6: required i64 total_uncompressed_size - // 7: required i64 total_compressed_size - // 8: optional list key_value_metadata - // 9: required i64 data_page_offset - // 10: optional i64 index_page_offset - // 11: optional i64 dictionary_page_offset - // 12: optional Statistics statistics; - // 13: optional list encoding_stats; - // 14: optional i64 bloom_filter_offset; - // 15: optional i32 bloom_filter_length; - // 16: optional SizeStatistics size_statistics; - // 17: optional GeospatialStatistics geospatial_statistics; - // } - fn serialize_column_meta_data(column_chunk: &ColumnChunkMetaData) -> Result> { - use crate::{file::statistics::page_stats_to_thrift, parquet_thrift::WriteThriftField}; - - let mut buf = Vec::new(); - let mut w = ThriftCompactOutputProtocol::new(&mut buf); - - column_chunk - .column_type() - .write_thrift_field(&mut w, 1, 0)?; - column_chunk.encodings.write_thrift_field(&mut w, 2, 1)?; - let path = column_chunk.column_descr.path().parts(); - let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect(); - path.write_thrift_field(&mut w, 3, 2)?; - column_chunk.compression.write_thrift_field(&mut w, 4, 3)?; - column_chunk.num_values.write_thrift_field(&mut w, 5, 4)?; - column_chunk - .total_uncompressed_size - .write_thrift_field(&mut w, 6, 5)?; - column_chunk - .total_compressed_size - .write_thrift_field(&mut w, 7, 6)?; - // no key_value_metadata here - let mut last_field_id = column_chunk - .data_page_offset - .write_thrift_field(&mut w, 9, 7)?; - if let Some(index_page_offset) = column_chunk.index_page_offset { - last_field_id = index_page_offset.write_thrift_field(&mut w, 10, last_field_id)?; - } - if let Some(dictionary_page_offset) = column_chunk.dictionary_page_offset { - last_field_id = dictionary_page_offset.write_thrift_field(&mut w, 11, last_field_id)?; - } - // PageStatistics is the same as thrift Statistics, but writable - let stats = page_stats_to_thrift(column_chunk.statistics()); - if let Some(stats) = stats { - last_field_id = stats.write_thrift_field(&mut w, 12, last_field_id)?; - } - if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() { - last_field_id = page_encoding_stats.write_thrift_field(&mut w, 13, last_field_id)?; - } - if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset { - last_field_id = bloom_filter_offset.write_thrift_field(&mut w, 
14, last_field_id)?; - } - if let Some(bloom_filter_length) = column_chunk.bloom_filter_length { - last_field_id = bloom_filter_length.write_thrift_field(&mut w, 15, last_field_id)?; - } - if let Some(index_page_offset) = column_chunk.index_page_offset { - // uncomment when we add geo spatial - //last_field_id = index_page_offset.write_thrift_field(&mut w, 16, last_field_id)?; - index_page_offset.write_thrift_field(&mut w, 16, last_field_id)?; - } - // TODO: geo spatial here - w.write_struct_end()?; - - Ok(buf) - } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 88c1cebf0351..2d311bb7fde4 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -337,7 +337,7 @@ impl SerializedFileWriter { &mut self.buf, &self.schema, &self.descr, - &mut self.row_groups, + self.row_groups.clone(), // FIXME(ets): I really want the writer to take ownership of everything Some(self.props.created_by().to_string()), self.props.writer_version().as_num(), ); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 1406295c3a4f..5c9f97ac43cd 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1114,6 +1114,27 @@ impl SchemaDescriptor { } } +// walk tree and count nodes +pub(crate) fn num_nodes(tp: &TypePtr) -> usize { + let mut n_nodes = 1usize; // count root + for f in tp.get_fields().iter() { + count_nodes(f, &mut n_nodes); + } + n_nodes +} + +pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) { + *n_nodes += 1; + match tp.as_ref() { + Type::GroupType { ref fields, .. } => { + for f in fields { + count_leaves(f, n_nodes); + } + } + _ => (), + } +} + // do a quick walk of the tree to get proper sizing for SchemaDescriptor arrays fn num_leaves(tp: &TypePtr) -> usize { let mut n_leaves = 0usize; From 486d8519bc90ea0c92573ff868f1d96534dbf403 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 7 Sep 2025 22:48:18 -0700 Subject: [PATCH 068/126] checkpoint...almost works --- parquet/src/arrow/arrow_writer/mod.rs | 1 + parquet/src/file/metadata/thrift_gen.rs | 39 ++++++++-- parquet/src/file/metadata/writer.rs | 96 +++++++++++-------------- parquet/src/file/writer.rs | 15 ++-- parquet/src/schema/types.rs | 2 +- 5 files changed, 88 insertions(+), 65 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index e91a67766496..dc5bd6b1af35 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -3011,6 +3011,7 @@ mod tests { // Coerced name of "item" should be "element" let schema = file_metadata.file_metadata().schema_descr(); + println!("schema {schema:?}"); assert_eq!(schema.column(3).name(), "element"); // Coerced name of "entries" should be "key_value" assert_eq!(schema.column(5).name(), "key_value"); diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 75673d2fca62..f14859893b85 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1220,11 +1220,32 @@ pub(crate) fn serialize_column_meta_data( if let Some(bloom_filter_length) = column_chunk.bloom_filter_length { last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?; } - if let Some(index_page_offset) = column_chunk.index_page_offset { - // uncomment when we add geo spatial - //last_field_id = index_page_offset.write_thrift_field(w, 16, last_field_id)?; - index_page_offset.write_thrift_field(w, 16, last_field_id)?; + + // SizeStatistics + let size_stats = if 
column_chunk.unencoded_byte_array_data_bytes.is_some() + || column_chunk.repetition_level_histogram.is_some() + || column_chunk.definition_level_histogram.is_some() + { + let repetition_level_histogram = column_chunk + .repetition_level_histogram() + .map(|hist| hist.clone().into_inner()); + + let definition_level_histogram = column_chunk + .definition_level_histogram() + .map(|hist| hist.clone().into_inner()); + + Some(SizeStatistics { + unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, + }) + } else { + None + }; + if let Some(size_stats) = size_stats { + size_stats.write_thrift_field(w, 16, last_field_id)?; } + // TODO: field 17 geo spatial stats here w.write_struct_end() } @@ -1305,7 +1326,10 @@ fn write_schema( repetition_type: Some(basic_info.repetition()), name: basic_info.name(), num_children: None, - converted_type: basic_info.converted_type().into(), + converted_type: match basic_info.converted_type() { + ConvertedType::NONE => None, + other => Some(other), + }, scale: if *scale >= 0 { Some(*scale) } else { None }, precision: if *precision >= 0 { Some(*precision) @@ -1334,7 +1358,10 @@ fn write_schema( repetition_type: repetition, name: basic_info.name(), num_children: Some(fields.len() as i32), - converted_type: basic_info.converted_type().into(), + converted_type: match basic_info.converted_type() { + ConvertedType::NONE => None, + other => Some(other), + }, scale: None, precision: None, field_id: if basic_info.has_id() { diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index e32051c549ac..6205d9632c41 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -19,7 +19,7 @@ use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileMeta}; use crate::file::metadata::{ ColumnChunkMetaData, ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData, }; -use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::schema::types::{SchemaDescPtr, SchemaDescriptor}; use crate::thrift::TSerializable; use crate::{ basic::ColumnOrder, @@ -56,11 +56,10 @@ use thrift::protocol::TCompactOutputProtocol; /// See [`ParquetMetaDataWriter`] for background and example. 
pub(crate) struct ThriftMetadataWriter<'a, W: Write> {
     buf: &'a mut TrackedWrite<W>,
-    schema: &'a TypePtr,
     schema_descr: &'a SchemaDescPtr,
     row_groups: Vec<RowGroupMetaData>,
-    column_indexes: Option<&'a [Vec<Option<ColumnIndexMetaData>>]>,
-    offset_indexes: Option<&'a [Vec<Option<OffsetIndexMetaData>>]>,
+    column_indexes: Option<Vec<Vec<Option<ColumnIndexMetaData>>>>,
+    offset_indexes: Option<Vec<Vec<Option<OffsetIndexMetaData>>>>,
     key_value_metadata: Option<Vec<KeyValue>>,
     created_by: Option<String>,
     object_writer: MetadataObjectWriter,
@@ -138,11 +137,14 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
     pub fn finish(mut self) -> Result<ParquetMetaData> {
         let num_rows = self.row_groups.iter().map(|x| x.num_rows).sum();

+        let column_indexes = std::mem::take(&mut self.column_indexes);
+        let offset_indexes = std::mem::take(&mut self.offset_indexes);
+
         // Write column indexes and offset indexes
-        if let Some(column_indexes) = self.column_indexes {
+        if let Some(column_indexes) = column_indexes.as_ref() {
             self.write_column_indexes(column_indexes)?;
         }
-        if let Some(offset_indexes) = self.offset_indexes {
+        if let Some(offset_indexes) = offset_indexes.as_ref() {
             self.write_offset_indexes(offset_indexes)?;
         }
@@ -216,28 +218,20 @@
             None => builder.set_row_groups(row_groups),
         };

-        let column_indexes: Option<ParquetColumnIndex> = self.column_indexes.map(|ovvi| {
-            ovvi.iter()
+        let column_indexes: Option<ParquetColumnIndex> = column_indexes.map(|ovvi| {
+            ovvi.into_iter()
                 .map(|vi| {
-                    vi.iter()
-                        .map(|oi| {
-                            oi.as_ref()
-                                .map(|i| i.clone())
-                                .unwrap_or(ColumnIndexMetaData::NONE)
-                        })
+                    vi.into_iter()
+                        .map(|oi| oi.unwrap_or(ColumnIndexMetaData::NONE))
                         .collect()
                 })
                 .collect()
         });

         // FIXME(ets): this will panic if there's a missing index.
-        let offset_indexes: Option<ParquetOffsetIndex> = self.offset_indexes.map(|ovvi| {
-            ovvi.iter()
-                .map(|vi| {
-                    vi.iter()
-                        .map(|oi| oi.as_ref().map(|i| i.clone()).unwrap())
-                        .collect()
-                })
+        let offset_indexes: Option<ParquetOffsetIndex> = offset_indexes.map(|ovvi| {
+            ovvi.into_iter()
+                .map(|vi| vi.into_iter().map(|oi| oi.unwrap()).collect())
                 .collect()
         });
@@ -249,7 +243,6 @@
     pub fn new(
         buf: &'a mut TrackedWrite<W>,
-        schema: &'a TypePtr,
         schema_descr: &'a SchemaDescPtr,
         row_groups: Vec<RowGroupMetaData>,
         created_by: Option<String>,
         writer_version: i32,
     ) -> Self {
         Self {
             buf,
-            schema,
             schema_descr,
             row_groups,
             column_indexes: None,
@@ -271,7 +263,7 @@
     pub fn with_column_indexes(
         mut self,
-        column_indexes: &'a [Vec<Option<ColumnIndexMetaData>>],
+        column_indexes: Vec<Vec<Option<ColumnIndexMetaData>>>,
     ) -> Self {
         self.column_indexes = Some(column_indexes);
         self
@@ -279,7 +271,7 @@
     pub fn with_offset_indexes(
         mut self,
-        offset_indexes: &'a [Vec<Option<OffsetIndexMetaData>>],
+        offset_indexes: Vec<Vec<Option<OffsetIndexMetaData>>>,
     ) -> Self {
         self.offset_indexes = Some(offset_indexes);
         self
@@ -418,14 +410,20 @@
         let mut encoder = ThriftMetadataWriter::new(
             &mut self.buf,
-            &schema,
             &schema_descr,
             row_groups,
             created_by,
             file_metadata.version(),
         );
-        encoder = encoder.with_column_indexes(&column_indexes);
-        encoder = encoder.with_offset_indexes(&offset_indexes);
+
+        if let Some(column_indexes) = column_indexes {
+            encoder = encoder.with_column_indexes(column_indexes);
+        }
+
+        if let Some(offset_indexes) = offset_indexes {
+            encoder = encoder.with_offset_indexes(offset_indexes);
+        }
+
         if let Some(key_value_metadata) = key_value_metadata {
             encoder = encoder.with_key_value_metadata(key_value_metadata);
         }
@@ -434,32 +432,29 @@
         Ok(())
     }

-    fn convert_column_indexes(&self) -> Vec<Vec<Option<ColumnIndexMetaData>>> {
+    fn convert_column_indexes(&self) -> Option<Vec<Vec<Option<ColumnIndexMetaData>>>> {
+        // FIXME(ets): we're converting from ParquetColumnIndex to Vec<Vec<Option<...>>>,
+        // but then converting back to ParquetColumnIndex in the end. need to unify this.
         if let Some(row_group_column_indexes) = self.metadata.column_index() {
-            (0..self.metadata.row_groups().len())
-                .map(|rg_idx| {
-                    let column_indexes = &row_group_column_indexes[rg_idx];
-                    column_indexes
-                        .iter()
-                        .map(|column_index| Some(column_index.clone()))
-                        .collect()
-                })
-                .collect()
+            Some(
+                (0..self.metadata.row_groups().len())
+                    .map(|rg_idx| {
+                        let column_indexes = &row_group_column_indexes[rg_idx];
+                        column_indexes
+                            .iter()
+                            .map(|column_index| Some(column_index.clone()))
+                            .collect()
+                    })
+                    .collect(),
+            )
         } else {
-            // make a None for each row group, for each column
-            self.metadata
-                .row_groups()
-                .iter()
-                .map(|rg| std::iter::repeat_n(None, rg.columns().len()).collect())
-                .collect()
+            None
         }
     }

-    fn convert_offset_index(&self) -> Vec<Vec<Option<OffsetIndexMetaData>>> {
+    fn convert_offset_index(&self) -> Option<Vec<Vec<Option<OffsetIndexMetaData>>>> {
         if let Some(row_group_offset_indexes) = self.metadata.offset_index() {
-            (0..self.metadata.row_groups().len())
+            Some((0..self.metadata.row_groups().len())
                 .map(|rg_idx| {
                     let offset_indexes = &row_group_offset_indexes[rg_idx];
                     offset_indexes
                         .iter()
                         .map(|offset_index| Some(offset_index.clone()))
                         .collect()
                 })
-                .collect()
+                .collect())
         } else {
-            // make a None for each row group, for each column
-            self.metadata
-                .row_groups()
-                .iter()
-                .map(|rg| std::iter::repeat_n(None, rg.columns().len()).collect())
-                .collect()
+            None
         }
     }
 }
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 2d311bb7fde4..1ae34401d34c 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -318,7 +318,8 @@ impl SerializedFileWriter {
         Ok(())
     }

-    /// Assembles and writes metadata at the end of the file.
+    /// Assembles and writes metadata at the end of the file. This will take ownership
+    /// of `row_groups` and the page index structures.
     fn write_metadata(&mut self) -> Result<ParquetMetaData> {
         self.finished = true;
@@ -333,11 +334,15 @@
             None => Some(self.kv_metadatas.clone()),
         };

+        // take ownership of metadata
+        let row_groups = std::mem::take(&mut self.row_groups);
+        let column_indexes = std::mem::take(&mut self.column_indexes);
+        let offset_indexes = std::mem::take(&mut self.offset_indexes);
+
         let mut encoder = ThriftMetadataWriter::new(
             &mut self.buf,
-            &self.schema,
             &self.descr,
-            self.row_groups.clone(), // FIXME(ets): I really want the writer to take ownership of everything
+            row_groups,
             Some(self.props.created_by().to_string()),
             self.props.writer_version().as_num(),
         );
@@ -351,8 +356,8 @@
             encoder = encoder.with_key_value_metadata(key_value_metadata)
         }

-        encoder = encoder.with_column_indexes(&self.column_indexes);
-        encoder = encoder.with_offset_indexes(&self.offset_indexes);
+        encoder = encoder.with_column_indexes(column_indexes);
+        encoder = encoder.with_offset_indexes(offset_indexes);

         encoder.finish()
     }
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 5c9f97ac43cd..297bdaab567b 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -1128,7 +1128,7 @@ pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
     match tp.as_ref() {
         Type::GroupType { ref fields, ..
} => { for f in fields { - count_leaves(f, n_nodes); + count_nodes(f, n_nodes); } } _ => (), From 3092ededd51527673e001a5ce20e4434cf2ff29a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 7 Sep 2025 23:32:11 -0700 Subject: [PATCH 069/126] some test fixes and cleanup --- parquet/src/arrow/arrow_writer/mod.rs | 16 +++++---- parquet/src/encryption/encrypt.rs | 52 --------------------------- parquet/src/file/metadata/mod.rs | 4 +-- parquet/src/file/metadata/writer.rs | 9 ----- parquet/src/file/writer.rs | 6 ++-- 5 files changed, 14 insertions(+), 73 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index dc5bd6b1af35..8701899c4636 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -740,7 +740,7 @@ impl ArrowColumnChunk { /// row_group_writer.close().unwrap(); /// /// let metadata = writer.close().unwrap(); -/// assert_eq!(metadata.num_rows, 3); +/// assert_eq!(metadata.file_metadata().num_rows(), 3); /// ``` pub struct ArrowColumnWriter { writer: ArrowColumnWriterImpl, @@ -3009,16 +3009,18 @@ mod tests { writer.write(&batch).unwrap(); let file_metadata = writer.close().unwrap(); + let schema = file_metadata.file_metadata().schema(); // Coerced name of "item" should be "element" - let schema = file_metadata.file_metadata().schema_descr(); - println!("schema {schema:?}"); - assert_eq!(schema.column(3).name(), "element"); + let list_field = &schema.get_fields()[0].get_fields()[0]; + assert_eq!(list_field.get_fields()[0].name(), "element"); + + let map_field = &schema.get_fields()[1].get_fields()[0]; // Coerced name of "entries" should be "key_value" - assert_eq!(schema.column(5).name(), "key_value"); + assert_eq!(map_field.name(), "key_value"); // Coerced name of "keys" should be "key" - assert_eq!(schema.column(6).name(), "key"); + assert_eq!(map_field.get_fields()[0].name(), "key"); // Coerced name of "values" should be "value" - assert_eq!(schema.column(7).name(), "value"); + assert_eq!(map_field.get_fields()[1].name(), "value"); // Double check schema after reading from the file let reader = SerializedFileReader::new(file).unwrap(); diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index fe9580551af3..1a22abff56fa 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -24,11 +24,9 @@ use crate::errors::{ParquetError, Result}; use crate::file::column_crypto_metadata::{ColumnCryptoMetaData, EncryptionWithColumnKey}; use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; -use crate::thrift::TSerializable; use ring::rand::{SecureRandom, SystemRandom}; use std::collections::{HashMap, HashSet}; use std::io::Write; -use thrift::protocol::TCompactOutputProtocol; #[derive(Debug, Clone, PartialEq)] struct EncryptionKey { @@ -365,18 +363,6 @@ impl FileEncryptor { } } -/// Write an encrypted Thrift serializable object -pub(crate) fn encrypt_object( - object: &T, - encryptor: &mut Box, - sink: &mut W, - module_aad: &[u8], -) -> Result<()> { - let encrypted_buffer = encrypt_object_to_vec(object, encryptor, module_aad)?; - sink.write_all(&encrypted_buffer)?; - Ok(()) -} - /// Write an encrypted Thrift serializable object pub(crate) fn encrypt_thrift_object( object: &T, @@ -389,29 +375,6 @@ pub(crate) fn encrypt_thrift_object( Ok(()) } -pub(crate) fn write_signed_plaintext_object( - object: &T, - encryptor: &mut Box, - sink: &mut W, - module_aad: &[u8], -) -> 
Result<()> { - let mut buffer: Vec = vec![]; - { - let mut protocol = TCompactOutputProtocol::new(&mut buffer); - object.write_to_out_protocol(&mut protocol)?; - } - sink.write_all(&buffer)?; - buffer = encryptor.encrypt(buffer.as_ref(), module_aad)?; - - // Format of encrypted buffer is: [ciphertext size, nonce, ciphertext, authentication tag] - let nonce = &buffer[SIZE_LEN..SIZE_LEN + NONCE_LEN]; - let tag = &buffer[buffer.len() - TAG_LEN..]; - sink.write_all(nonce)?; - sink.write_all(tag)?; - - Ok(()) -} - pub(crate) fn write_signed_plaintext_thrift_object( object: &T, encryptor: &mut Box, @@ -435,21 +398,6 @@ pub(crate) fn write_signed_plaintext_thrift_object( Ok(()) } -/// Encrypt a Thrift serializable object to a byte vector -pub(crate) fn encrypt_object_to_vec( - object: &T, - encryptor: &mut Box, - module_aad: &[u8], -) -> Result> { - let mut buffer: Vec = vec![]; - { - let mut unencrypted_protocol = TCompactOutputProtocol::new(&mut buffer); - object.write_to_out_protocol(&mut unencrypted_protocol)?; - } - - encryptor.encrypt(buffer.as_ref(), module_aad) -} - /// Encrypt a Thrift serializable object to a byte vector pub(crate) fn encrypt_thrift_object_to_vec( object: &T, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 03745a53effc..57949a81a4f7 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -2068,7 +2068,7 @@ mod tests { #[cfg(not(feature = "encryption"))] let base_expected_size = 2280; #[cfg(feature = "encryption")] - let base_expected_size = 2616; + let base_expected_size = 2712; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -2108,7 +2108,7 @@ mod tests { #[cfg(not(feature = "encryption"))] let bigger_expected_size = 2704; #[cfg(feature = "encryption")] - let bigger_expected_size = 3040; + let bigger_expected_size = 3136; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 6205d9632c41..52ec092cc504 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -20,7 +20,6 @@ use crate::file::metadata::{ ColumnChunkMetaData, ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData, }; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor}; -use crate::thrift::TSerializable; use crate::{ basic::ColumnOrder, file::metadata::{FileMetaData, ParquetMetaDataBuilder}, @@ -49,7 +48,6 @@ use crate::{ }; use std::io::Write; use std::sync::Arc; -use thrift::protocol::TCompactOutputProtocol; /// Writes `crate::file::metadata` structures to a thrift encoded byte stream /// @@ -476,13 +474,6 @@ struct MetadataObjectWriter { } impl MetadataObjectWriter { - #[inline] - fn write_object(object: &impl TSerializable, sink: impl Write) -> Result<()> { - let mut protocol = TCompactOutputProtocol::new(sink); - object.write_to_out_protocol(&mut protocol)?; - Ok(()) - } - #[inline] fn write_thrift_object(object: &impl WriteThrift, sink: impl Write) -> Result<()> { let mut protocol = ThriftCompactOutputProtocol::new(sink); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 1ae34401d34c..8ba677274947 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -149,7 +149,6 @@ pub type OnCloseRowGroup<'a, W> = Box< /// - After all row groups have been written, close the file writer using `close` method. 
pub struct SerializedFileWriter { buf: TrackedWrite, - schema: TypePtr, descr: SchemaDescPtr, props: WriterPropertiesPtr, row_groups: Vec, @@ -189,7 +188,6 @@ impl SerializedFileWriter { Self::start_file(&properties, &mut buf)?; Ok(Self { buf, - schema, descr: Arc::new(schema_descriptor), props: properties, row_groups: vec![], @@ -357,7 +355,9 @@ impl SerializedFileWriter { } encoder = encoder.with_column_indexes(column_indexes); - encoder = encoder.with_offset_indexes(offset_indexes); + if !self.props.offset_index_disabled() { + encoder = encoder.with_offset_indexes(offset_indexes); + } encoder.finish() } From da66845b44f7d0605849af99aee304c6bba808fa Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 7 Sep 2025 23:34:23 -0700 Subject: [PATCH 070/126] more fixes and cleanup --- parquet/src/file/metadata/thrift_gen.rs | 4 ++-- parquet/src/file/metadata/writer.rs | 20 +++++++++++--------- parquet/tests/encryption/encryption.rs | 20 +++++++------------- parquet/tests/encryption/encryption_async.rs | 4 ++-- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index f14859893b85..03ce8558e582 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1223,8 +1223,8 @@ pub(crate) fn serialize_column_meta_data( // SizeStatistics let size_stats = if column_chunk.unencoded_byte_array_data_bytes.is_some() - || column_chunk.repetition_level_histogram.is_some() - || column_chunk.definition_level_histogram.is_some() + || column_chunk.repetition_level_histogram.is_some() + || column_chunk.definition_level_histogram.is_some() { let repetition_level_histogram = column_chunk .repetition_level_histogram() diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 52ec092cc504..5a1c44b50539 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -452,15 +452,17 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { fn convert_offset_index(&self) -> Option>>> { if let Some(row_group_offset_indexes) = self.metadata.offset_index() { - Some((0..self.metadata.row_groups().len()) - .map(|rg_idx| { - let offset_indexes = &row_group_offset_indexes[rg_idx]; - offset_indexes - .iter() - .map(|offset_index| Some(offset_index.clone())) - .collect() - }) - .collect()) + Some( + (0..self.metadata.row_groups().len()) + .map(|rg_idx| { + let offset_indexes = &row_group_offset_indexes[rg_idx]; + offset_indexes + .iter() + .map(|offset_index| Some(offset_index.clone())) + .collect() + }) + .collect(), + ) } else { None } diff --git a/parquet/tests/encryption/encryption.rs b/parquet/tests/encryption/encryption.rs index 96dd8654cd76..0261c22c2c2d 100644 --- a/parquet/tests/encryption/encryption.rs +++ b/parquet/tests/encryption/encryption.rs @@ -982,23 +982,17 @@ pub fn test_retrieve_row_group_statistics_after_encrypted_write() { } let file_metadata = writer.close().unwrap(); - assert_eq!(file_metadata.row_groups.len(), 1); - let row_group = &file_metadata.row_groups[0]; - assert_eq!(row_group.columns.len(), 1); - let column = &row_group.columns[0]; - let column_stats = column - .meta_data - .as_ref() - .unwrap() - .statistics - .as_ref() - .unwrap(); + assert_eq!(file_metadata.num_row_groups(), 1); + let row_group = file_metadata.row_group(0); + assert_eq!(row_group.num_columns(), 1); + let column = row_group.column(0); + let column_stats = column.statistics().unwrap(); assert_eq!( - column_stats.min_value.as_deref(), + 
column_stats.min_bytes_opt(),
         Some(3i32.to_le_bytes().as_slice())
     );
     assert_eq!(
-        column_stats.max_value.as_deref(),
+        column_stats.max_bytes_opt(),
         Some(19i32.to_le_bytes().as_slice())
     );
 }
diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs
index af107f1e2610..8672754c2d91 100644
--- a/parquet/tests/encryption/encryption_async.rs
+++ b/parquet/tests/encryption/encryption_async.rs
@@ -578,8 +578,8 @@ async fn test_multi_threaded_encrypted_writing() {

     // Close the file writer which writes the footer
     let metadata = writer.finish().unwrap();
-    assert_eq!(metadata.num_rows, 100);
-    assert_eq!(metadata.schema, metadata.schema);
+    assert_eq!(metadata.file_metadata().num_rows(), 100);
+    // TODO(ets): the removed assertion compared metadata.schema with itself and verified nothing; replace it with a check against the expected schema.

     // Check that the file was written correctly
     let (read_record_batches, read_metadata) =
From 9ab7bb0b7beab28ba4ebeb9e2189bd8a33f49567 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 8 Sep 2025 08:10:54 -0700
Subject: [PATCH 071/126] clippy fixes

---
 parquet/benches/metadata.rs             | 44 ++++++++++++------------
 parquet/src/file/metadata/thrift_gen.rs |  4 +--
 parquet/src/file/metadata/writer.rs     | 24 ++++++--------
 parquet/src/file/writer.rs              |  2 +-
 parquet/src/schema/types.rs             |  9 ++---
 5 files changed, 37 insertions(+), 46 deletions(-)

diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs
index ced0175da878..1992a6868f43 100644
--- a/parquet/benches/metadata.rs
+++ b/parquet/benches/metadata.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.

+#[cfg(feature = "arrow")]
+use parquet::file::metadata::ParquetMetaData;
 use parquet::file::metadata::ParquetMetaDataReader;
 use rand::Rng;
 use thrift::protocol::TCompactOutputProtocol;
@@ -164,7 +166,7 @@ fn get_footer_bytes(data: Bytes) -> Bytes {
 }

 #[cfg(feature = "arrow")]
-fn rewrite_file(bytes: Bytes) -> (Bytes, FileMetaData) {
+fn rewrite_file(bytes: Bytes) -> (Bytes, ParquetMetaData) {
     use arrow::array::RecordBatchReader;
     use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};
     use parquet::file::properties::{EnabledStatistics, WriterProperties};
@@ -242,40 +244,36 @@ fn criterion_benchmark(c: &mut Criterion) {
     #[cfg(feature = "arrow")]
     c.bench_function("page headers", |b| {
         b.iter(|| {
-            metadata.row_groups.iter().for_each(|rg| {
-                rg.columns.iter().for_each(|col| {
-                    if let Some(col_meta) = &col.meta_data {
-                        if let Some(dict_offset) = col_meta.dictionary_page_offset {
-                            parquet::thrift::bench_page_header(
-                                &file_bytes.slice(dict_offset as usize..),
-                            );
-                        }
+            for rg in metadata.row_groups() {
+                for col in rg.columns() {
+                    if let Some(dict_offset) = col.dictionary_page_offset() {
                         parquet::thrift::bench_page_header(
-                            &file_bytes.slice(col_meta.data_page_offset as usize..),
+                            &file_bytes.slice(dict_offset as usize..),
                         );
                     }
-                });
-            });
+                    parquet::thrift::bench_page_header(
+                        &file_bytes.slice(col.data_page_offset() as usize..),
+                    );
+                }
+            }
         })
     });

     #[cfg(feature = "arrow")]
     c.bench_function("page headers (no stats)", |b| {
         b.iter(|| {
-            metadata.row_groups.iter().for_each(|rg| {
-                rg.columns.iter().for_each(|col| {
-                    if let Some(col_meta) = &col.meta_data {
-                        if let Some(dict_offset) = col_meta.dictionary_page_offset {
-                            parquet::thrift::bench_page_header_no_stats(
-                                &file_bytes.slice(dict_offset as usize..),
-                            );
-                        }
+            for rg in metadata.row_groups() {
+                for col in rg.columns() {
+                    if let Some(dict_offset) = col.dictionary_page_offset() {
parquet::thrift::bench_page_header_no_stats( - &file_bytes.slice(col_meta.data_page_offset as usize..), + &file_bytes.slice(dict_offset as usize..), ); } - }); - }); + parquet::thrift::bench_page_header_no_stats( + &file_bytes.slice(col.data_page_offset() as usize..), + ); + } + } }) }); } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 03ce8558e582..278734200c3a 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1444,7 +1444,7 @@ impl WriteThrift for ColumnChunkMetaData { // only write the ColumnMetaData if we haven't already encrypted it if self.encrypted_column_metadata.is_none() { writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; - serialize_column_meta_data(&self, writer)?; + serialize_column_meta_data(self, writer)?; last_field_id = 3; } @@ -1488,7 +1488,7 @@ impl WriteThrift for ColumnChunkMetaData { // always write the ColumnMetaData writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; - serialize_column_meta_data(&self, writer)?; + serialize_column_meta_data(self, writer)?; last_field_id = 3; if let Some(offset_idx_off) = self.offset_index_offset() { diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 5a1c44b50539..031223959509 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -433,8 +433,9 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { fn convert_column_indexes(&self) -> Option>>> { // FIXME(ets): we're converting from ParquetColumnIndex to vec>, // but then converting back to ParquetColumnIndex in the end. need to unify this. - if let Some(row_group_column_indexes) = self.metadata.column_index() { - Some( + self.metadata + .column_index() + .map(|row_group_column_indexes| { (0..self.metadata.row_groups().len()) .map(|rg_idx| { let column_indexes = &row_group_column_indexes[rg_idx]; @@ -443,16 +444,14 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { .map(|column_index| Some(column_index.clone())) .collect() }) - .collect(), - ) - } else { - None - } + .collect() + }) } fn convert_offset_index(&self) -> Option>>> { - if let Some(row_group_offset_indexes) = self.metadata.offset_index() { - Some( + self.metadata + .offset_index() + .map(|row_group_offset_indexes| { (0..self.metadata.row_groups().len()) .map(|rg_idx| { let offset_indexes = &row_group_offset_indexes[rg_idx]; @@ -461,11 +460,8 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { .map(|offset_index| Some(offset_index.clone())) .collect() }) - .collect(), - ) - } else { - None - } + .collect() + }) } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 8ba677274947..1963af8bb1ba 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -2098,7 +2098,7 @@ mod tests { unenc_size, meta_data.unencoded_byte_array_data_bytes().unwrap() ); - check_def_hist(&meta_data.definition_level_histogram().unwrap().values()); + check_def_hist(meta_data.definition_level_histogram().unwrap().values()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 297bdaab567b..dca0f8417957 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1125,13 +1125,10 @@ pub(crate) fn num_nodes(tp: &TypePtr) -> usize { pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) { *n_nodes += 1; - match tp.as_ref() { - Type::GroupType 
{ ref fields, .. } => { - for f in fields { - count_nodes(f, n_nodes); - } + if let Type::GroupType { ref fields, .. } = tp.as_ref() { + for f in fields { + count_nodes(f, n_nodes); } - _ => (), } } From 544eca01f259efebaeea3f59f634fab464b8c352 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 8 Sep 2025 13:19:04 -0700 Subject: [PATCH 072/126] start removing references to format --- parquet/src/file/metadata/mod.rs | 347 +++--------------------- parquet/src/file/metadata/thrift_gen.rs | 38 ++- parquet/src/file/writer.rs | 14 +- 3 files changed, 73 insertions(+), 326 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 57949a81a4f7..b1ec7f8646b7 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -99,15 +99,14 @@ mod writer; #[cfg(feature = "encryption")] use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] -use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; +use crate::file::column_crypto_metadata::ColumnCryptoMetaData; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex}; -use crate::file::statistics::{self, Statistics}; +use crate::file::statistics::Statistics; use crate::file::{ - page_encoding_stats::{self, PageEncodingStats}, + page_encoding_stats::PageEncodingStats, page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}, }; -use crate::format::ColumnCryptoMetaData as TColumnCryptoMetaData; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, @@ -563,26 +562,6 @@ pub struct SortingColumn { } ); -impl From<&crate::format::SortingColumn> for SortingColumn { - fn from(value: &crate::format::SortingColumn) -> Self { - Self { - column_idx: value.column_idx, - descending: value.descending, - nulls_first: value.nulls_first, - } - } -} - -impl From<&SortingColumn> for crate::format::SortingColumn { - fn from(value: &SortingColumn) -> Self { - Self { - column_idx: value.column_idx, - descending: value.descending, - nulls_first: value.nulls_first, - } - } -} - /// Reference counted pointer for [`RowGroupMetaData`]. pub type RowGroupMetaDataPtr = Arc; @@ -674,60 +653,6 @@ impl RowGroupMetaData { self.file_offset } - /// Method to convert from Thrift. - pub fn from_thrift( - schema_descr: SchemaDescPtr, - mut rg: crate::format::RowGroup, - ) -> Result { - if schema_descr.num_columns() != rg.columns.len() { - return Err(general_err!( - "Column count mismatch. Schema has {} columns while Row Group has {}", - schema_descr.num_columns(), - rg.columns.len() - )); - } - let total_byte_size = rg.total_byte_size; - let num_rows = rg.num_rows; - let mut columns = vec![]; - - for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { - columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); - } - - let sorting_columns = rg.sorting_columns.map(|scs| { - scs.iter() - .map(|sc| sc.into()) - .collect::>() - }); - Ok(RowGroupMetaData { - columns, - num_rows, - sorting_columns, - total_byte_size, - schema_descr, - file_offset: rg.file_offset, - ordinal: rg.ordinal, - }) - } - - /// Method to convert to Thrift. 
- pub fn to_thrift(&self) -> crate::format::RowGroup { - let sorting_columns = self.sorting_columns().map(|scs| { - scs.iter() - .map(|sc| sc.into()) - .collect::>() - }); - crate::format::RowGroup { - columns: self.columns().iter().map(|v| v.to_thrift()).collect(), - total_byte_size: self.total_byte_size, - num_rows: self.num_rows, - sorting_columns, - file_offset: self.file_offset(), - total_compressed_size: Some(self.compressed_size()), - ordinal: self.ordinal, - } - } - /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`] pub fn into_builder(self) -> RowGroupMetaDataBuilder { RowGroupMetaDataBuilder(self) @@ -1140,183 +1065,10 @@ impl ColumnChunkMetaData { self.column_crypto_metadata.as_ref() } - /// Method to convert from Thrift. - pub fn from_thrift( - column_descr: ColumnDescPtr, - cc: crate::format::ColumnChunk, - ) -> Result { - if cc.meta_data.is_none() { - return Err(general_err!("Expected to have column metadata")); - } - let mut col_metadata: crate::format::ColumnMetaData = cc.meta_data.unwrap(); - let column_type = Type::try_from(col_metadata.type_)?; - let encodings = col_metadata - .encodings - .drain(0..) - .map(Encoding::try_from) - .collect::>()?; - let compression = Compression::try_from(col_metadata.codec)?; - let file_path = cc.file_path; - let file_offset = cc.file_offset; - let num_values = col_metadata.num_values; - let total_compressed_size = col_metadata.total_compressed_size; - let total_uncompressed_size = col_metadata.total_uncompressed_size; - let data_page_offset = col_metadata.data_page_offset; - let index_page_offset = col_metadata.index_page_offset; - let dictionary_page_offset = col_metadata.dictionary_page_offset; - let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?; - let encoding_stats = col_metadata - .encoding_stats - .as_ref() - .map(|vec| { - vec.iter() - .map(page_encoding_stats::try_from_thrift) - .collect::>() - }) - .transpose()?; - let bloom_filter_offset = col_metadata.bloom_filter_offset; - let bloom_filter_length = col_metadata.bloom_filter_length; - let offset_index_offset = cc.offset_index_offset; - let offset_index_length = cc.offset_index_length; - let column_index_offset = cc.column_index_offset; - let column_index_length = cc.column_index_length; - let ( - unencoded_byte_array_data_bytes, - repetition_level_histogram, - definition_level_histogram, - ) = if let Some(size_stats) = col_metadata.size_statistics { - ( - size_stats.unencoded_byte_array_data_bytes, - size_stats.repetition_level_histogram, - size_stats.definition_level_histogram, - ) - } else { - (None, None, None) - }; - - let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); - let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); - - #[cfg(feature = "encryption")] - let column_crypto_metadata = if let Some(crypto_metadata) = cc.crypto_metadata { - Some(column_crypto_metadata::try_from_thrift(&crypto_metadata)?) 
- } else { - None - }; - - let result = ColumnChunkMetaData { - column_descr, - encodings, - file_path, - file_offset, - num_values, - compression, - total_compressed_size, - total_uncompressed_size, - data_page_offset, - index_page_offset, - dictionary_page_offset, - statistics, - encoding_stats, - bloom_filter_offset, - bloom_filter_length, - offset_index_offset, - offset_index_length, - column_index_offset, - column_index_length, - unencoded_byte_array_data_bytes, - repetition_level_histogram, - definition_level_histogram, - #[cfg(feature = "encryption")] - column_crypto_metadata, - #[cfg(feature = "encryption")] - encrypted_column_metadata: None, - }; - Ok(result) - } - - /// Method to convert to Thrift. - pub fn to_thrift(&self) -> crate::format::ColumnChunk { - let column_metadata = self.to_column_metadata_thrift(); - - crate::format::ColumnChunk { - file_path: self.file_path().map(|s| s.to_owned()), - file_offset: self.file_offset, - meta_data: Some(column_metadata), - offset_index_offset: self.offset_index_offset, - offset_index_length: self.offset_index_length, - column_index_offset: self.column_index_offset, - column_index_length: self.column_index_length, - crypto_metadata: self.column_crypto_metadata_thrift(), - encrypted_column_metadata: None, - } - } - - /// Method to convert to Thrift `ColumnMetaData` - pub fn to_column_metadata_thrift(&self) -> crate::format::ColumnMetaData { - let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() - || self.repetition_level_histogram.is_some() - || self.definition_level_histogram.is_some() - { - let repetition_level_histogram = self - .repetition_level_histogram - .as_ref() - .map(|hist| hist.clone().into_inner()); - - let definition_level_histogram = self - .definition_level_histogram - .as_ref() - .map(|hist| hist.clone().into_inner()); - - Some(crate::format::SizeStatistics { - unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, - repetition_level_histogram, - definition_level_histogram, - }) - } else { - None - }; - - crate::format::ColumnMetaData { - type_: self.column_type().into(), - encodings: self.encodings().iter().map(|&v| v.into()).collect(), - path_in_schema: self.column_path().as_ref().to_vec(), - codec: self.compression.into(), - num_values: self.num_values, - total_uncompressed_size: self.total_uncompressed_size, - total_compressed_size: self.total_compressed_size, - key_value_metadata: None, - data_page_offset: self.data_page_offset, - index_page_offset: self.index_page_offset, - dictionary_page_offset: self.dictionary_page_offset, - statistics: statistics::to_thrift(self.statistics.as_ref()), - encoding_stats: self - .encoding_stats - .as_ref() - .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), - bloom_filter_offset: self.bloom_filter_offset, - bloom_filter_length: self.bloom_filter_length, - size_statistics, - geospatial_statistics: None, - } - } - /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`] pub fn into_builder(self) -> ColumnChunkMetaDataBuilder { ColumnChunkMetaDataBuilder::from(self) } - - #[cfg(feature = "encryption")] - fn column_crypto_metadata_thrift(&self) -> Option { - self.column_crypto_metadata - .as_ref() - .map(column_crypto_metadata::to_thrift) - } - - #[cfg(not(feature = "encryption"))] - fn column_crypto_metadata_thrift(&self) -> Option { - None - } } /// Builder for [`ColumnChunkMetaData`] @@ -1613,21 +1365,6 @@ impl ColumnIndexBuilder { self.valid } - /// Build and get the thrift metadata of column index - /// - 
/// Note: callers should check [`Self::valid`] before calling this method - pub fn build_to_thrift(self) -> crate::format::ColumnIndex { - crate::format::ColumnIndex::new( - self.null_pages, - self.min_values, - self.max_values, - self.boundary_order.into(), - self.null_counts, - self.repetition_level_histograms, - self.definition_level_histograms, - ) - } - /// Build and get the column index /// /// Note: callers should check [`Self::valid`] before calling this method @@ -1762,20 +1499,6 @@ impl OffsetIndexBuilder { } } - /// Build and get the thrift metadata of offset index - pub fn build_to_thrift(self) -> crate::format::OffsetIndex { - let locations = self - .offset_array - .iter() - .zip(self.compressed_page_size_array.iter()) - .zip(self.first_row_index_array.iter()) - .map(|((offset, size), row_index)| { - crate::format::PageLocation::new(*offset, *size, *row_index) - }) - .collect::>(); - crate::format::OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array) - } - /// Build and get the thrift metadata of offset index pub fn build(self) -> OffsetIndexMetaData { let locations = self @@ -1800,7 +1523,7 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::page_index::column_index::{ColumnIndex, PrimitiveColumnIndex}; + use crate::file::metadata::thrift_gen::tests::{read_column_chunk, read_row_group}; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -1819,12 +1542,13 @@ mod tests { .build() .unwrap(); - let row_group_exp = row_group_meta.to_thrift(); - let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()) - .unwrap() - .to_thrift(); + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + row_group_meta.write_thrift(&mut writer).unwrap(); - assert_eq!(row_group_res, row_group_exp); + let row_group_res = read_row_group(&mut buf, schema_descr).unwrap(); + + assert_eq!(row_group_res, row_group_meta); } #[test] @@ -1900,11 +1624,13 @@ mod tests { .set_ordinal(1) .build() .unwrap(); + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + row_group_meta_2cols.write_thrift(&mut writer).unwrap(); - let err = - RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift()) - .unwrap_err() - .to_string(); + let err = read_row_group(&mut buf, schema_descr_3cols) + .unwrap_err() + .to_string(); assert_eq!( err, "Parquet error: Column count mismatch. 
Schema has 3 columns while Row Group has 2" @@ -1948,8 +1674,10 @@ mod tests { .build() .unwrap(); - let col_chunk_res = - ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap(); + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + col_metadata.write_thrift(&mut writer).unwrap(); + let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap(); assert_eq!(col_chunk_res, col_metadata); } @@ -1962,12 +1690,12 @@ mod tests { .build() .unwrap(); - let col_chunk_exp = col_metadata.to_thrift(); - let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()) - .unwrap() - .to_thrift(); + let mut buf = Vec::new(); + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + col_metadata.write_thrift(&mut writer).unwrap(); + let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap(); - assert_eq!(col_chunk_res, col_chunk_exp); + assert_eq!(col_chunk_res, col_metadata); } #[test] @@ -2074,17 +1802,10 @@ mod tests { let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN); column_index.append(false, vec![1u8], vec![2u8, 3u8], 4); - let column_index = column_index.build_to_thrift(); - let native_index = PrimitiveColumnIndex:: { - column_index: ColumnIndex { - null_pages: column_index.null_pages, - boundary_order: column_index.boundary_order.try_into().unwrap(), - null_counts: column_index.null_counts, - repetition_level_histograms: column_index.repetition_level_histograms, - definition_level_histograms: column_index.definition_level_histograms, - }, - min_values: vec![], - max_values: vec![], + let column_index = column_index.build().unwrap(); + let native_index = match column_index { + ColumnIndexMetaData::BOOLEAN(index) => index, + _ => panic!("wrong type of column index"), }; // Now, add in OffsetIndex @@ -2095,20 +1816,18 @@ mod tests { offset_index.append_row_count(1); offset_index.append_offset_and_size(2, 3); offset_index.append_unencoded_byte_array_data_bytes(Some(10)); - let offset_index = offset_index.build_to_thrift(); + let offset_index = offset_index.build(); let parquet_meta = ParquetMetaDataBuilder::new(file_metadata) .set_row_groups(row_group_meta) .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]])) - .set_offset_index(Some(vec![vec![ - OffsetIndexMetaData::try_new(offset_index).unwrap() - ]])) + .set_offset_index(Some(vec![vec![offset_index]])) .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2704; + let bigger_expected_size = 2706; #[cfg(feature = "encryption")] - let bigger_expected_size = 3136; + let bigger_expected_size = 3138; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 278734200c3a..bde1b67abd01 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -267,6 +267,14 @@ fn convert_row_group( row_group: RowGroup, schema_descr: Arc, ) -> Result { + if schema_descr.num_columns() != row_group.columns.len() { + return Err(general_err!( + "Column count mismatch. 
Schema has {} columns while Row Group has {}", + schema_descr.num_columns(), + row_group.columns.len() + )); + } + let num_rows = row_group.num_rows; let sorting_columns = row_group.sorting_columns; let total_byte_size = row_group.total_byte_size; @@ -1509,9 +1517,35 @@ impl WriteThrift for ColumnChunkMetaData { } #[cfg(test)] -mod tests { - use crate::file::metadata::thrift_gen::BoundingBox; +pub(crate) mod tests { + use crate::errors::Result; + use crate::file::metadata::thrift_gen::{ + convert_column, convert_row_group, BoundingBox, ColumnChunk, RowGroup, + }; + use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use crate::parquet_thrift::tests::test_roundtrip; + use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; + use crate::schema::types::{ColumnDescriptor, SchemaDescriptor}; + use std::sync::Arc; + + // for testing. decode thrift encoded RowGroup + pub(crate) fn read_row_group( + buf: &mut [u8], + schema_descr: Arc, + ) -> Result { + let mut reader = ThriftSliceInputProtocol::new(buf); + let rg = RowGroup::read_thrift(&mut reader)?; + convert_row_group(rg, schema_descr) + } + + pub(crate) fn read_column_chunk( + buf: &mut [u8], + column_descr: Arc, + ) -> Result { + let mut reader = ThriftSliceInputProtocol::new(buf); + let cc = ColumnChunk::read_thrift(&mut reader)?; + convert_column(cc, column_descr) + } #[test] fn test_bounding_box_roundtrip() { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 1963af8bb1ba..99e0e7e7fb80 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1027,6 +1027,7 @@ mod tests { use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; + use crate::file::statistics::page_stats_to_thrift; use crate::file::{ properties::{ReaderProperties, WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, @@ -1870,29 +1871,22 @@ mod tests { let metadata = row_group_writer.close().unwrap(); writer.close().unwrap(); - let thrift = metadata.to_thrift(); - let encoded_stats: Vec<_> = thrift - .columns - .into_iter() - .map(|x| x.meta_data.unwrap().statistics.unwrap()) - .collect(); - // decimal - let s = &encoded_stats[0]; + let s = page_stats_to_thrift(metadata.column(0).statistics()).unwrap(); assert_eq!(s.min.as_deref(), Some(1_i32.to_le_bytes().as_ref())); assert_eq!(s.max.as_deref(), Some(3_i32.to_le_bytes().as_ref())); assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); // i32 - let s = &encoded_stats[1]; + let s = page_stats_to_thrift(metadata.column(1).statistics()).unwrap(); assert_eq!(s.min.as_deref(), Some(1_i32.to_le_bytes().as_ref())); assert_eq!(s.max.as_deref(), Some(3_i32.to_le_bytes().as_ref())); assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); // u32 - let s = &encoded_stats[2]; + let s = page_stats_to_thrift(metadata.column(2).statistics()).unwrap(); assert_eq!(s.min.as_deref(), None); assert_eq!(s.max.as_deref(), None); assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); From 0b33d259459406b4b0fc66a537de22a35050db4b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 8 Sep 2025 13:26:23 -0700 Subject: [PATCH 073/126] more format cleanup --- parquet/src/file/column_crypto_metadata.rs | 55 ---- 
parquet/src/file/metadata/thrift_gen.rs | 60 +++- parquet/src/file/page_encoding_stats.rs | 46 +--- parquet/src/file/page_index/offset_index.rs | 32 --- parquet/src/schema/types.rs | 291 +++----------------- parquet/tests/arrow_reader/io/mod.rs | 10 +- 6 files changed, 95 insertions(+), 399 deletions(-) diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 6a538bd42bc0..bba28bc4b019 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -20,11 +20,6 @@ use std::io::Write; use crate::errors::{ParquetError, Result}; -use crate::format::{ - ColumnCryptoMetaData as TColumnCryptoMetaData, - EncryptionWithColumnKey as TEncryptionWithColumnKey, - EncryptionWithFooterKey as TEncryptionWithFooterKey, -}; use crate::parquet_thrift::{ read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, @@ -53,61 +48,11 @@ union ColumnCryptoMetaData { } ); -/// Converts Thrift definition into `ColumnCryptoMetadata`. -pub fn try_from_thrift( - thrift_column_crypto_metadata: &TColumnCryptoMetaData, -) -> Result { - let crypto_metadata = match thrift_column_crypto_metadata { - TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_) => { - ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY - } - TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(encryption_with_column_key) => { - ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(EncryptionWithColumnKey { - path_in_schema: encryption_with_column_key.path_in_schema.clone(), - key_metadata: encryption_with_column_key.key_metadata.clone(), - }) - } - }; - Ok(crypto_metadata) -} - -/// Converts `ColumnCryptoMetadata` into Thrift definition. -pub fn to_thrift(column_crypto_metadata: &ColumnCryptoMetaData) -> TColumnCryptoMetaData { - match column_crypto_metadata { - ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY => { - TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(TEncryptionWithFooterKey {}) - } - ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(encryption_with_column_key) => { - TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(TEncryptionWithColumnKey { - path_in_schema: encryption_with_column_key.path_in_schema.clone(), - key_metadata: encryption_with_column_key.key_metadata.clone(), - }) - } - } -} - #[cfg(test)] mod tests { use super::*; use crate::parquet_thrift::tests::test_roundtrip; - #[test] - fn test_encryption_with_footer_key_from_thrift() { - let metadata = ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY; - - assert_eq!(try_from_thrift(&to_thrift(&metadata)).unwrap(), metadata); - } - - #[test] - fn test_encryption_with_column_key_from_thrift() { - let metadata = ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(EncryptionWithColumnKey { - path_in_schema: vec!["abc".to_owned(), "def".to_owned()], - key_metadata: Some(vec![0, 1, 2, 3, 4, 5]), - }); - - assert_eq!(try_from_thrift(&to_thrift(&metadata)).unwrap(), metadata); - } - #[test] fn test_column_crypto_roundtrip() { test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY); diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index bde1b67abd01..9fbfcbb0f45d 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1278,7 +1278,7 @@ impl<'a> WriteThrift for FileMeta<'a> { // field 2 is schema. do depth-first traversal of tree, converting to SchemaElement and // writing along the way. 
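// Thrift encodes the schema as a flattened, depth-first list of SchemaElement
// structs in which each group node records num_children rather than nesting
// its children. The list header needs the element count up front, which is
// what the node count computed below supplies. A minimal sketch of the same
// pre-order count over a toy tree (types are illustrative, not the crate's):
struct Node {
    children: Vec<Node>,
}

fn preorder_count(node: &Node) -> usize {
    // one SchemaElement for this node, plus one for every descendant
    1 + node.children.iter().map(preorder_count).sum::<usize>()
}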
let root = self.file_metadata.schema_descr().root_schema_ptr();
- let schema_len = num_nodes(&root);
+ let schema_len = num_nodes(&root)?;
writer.write_field_begin(FieldType::List, 2, 1)?;
writer.write_list_begin(ElementType::Struct, schema_len)?;
// recursively write Type nodes as SchemaElements
@@ -1313,6 +1313,16 @@ impl<'a> WriteThrift for FileMeta<'a> {
}

fn write_schema<W: Write>(
+ schema: &TypePtr,
+ writer: &mut ThriftCompactOutputProtocol<W>,
+) -> Result<()> {
+ if !schema.is_group() {
+ return Err(general_err!("Root schema must be Group type"));
+ }
+ write_schema_helper(schema, writer)
+}
+
+fn write_schema_helper<W: Write>(
node: &TypePtr,
writer: &mut ThriftCompactOutputProtocol<W>,
) -> Result<()> {
@@ -1384,7 +1394,7 @@
// Add child elements for a group
for field in fields {
- write_schema(field, writer)?;
+ write_schema_helper(field, writer)?;
}
Ok(())
}
@@ -1520,12 +1530,18 @@ impl WriteThrift for ColumnChunkMetaData {
pub(crate) mod tests {
use crate::errors::Result;
use crate::file::metadata::thrift_gen::{
- convert_column, convert_row_group, BoundingBox, ColumnChunk, RowGroup,
+ convert_column, convert_row_group, write_schema, BoundingBox, ColumnChunk, RowGroup,
+ SchemaElement,
};
use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
use crate::parquet_thrift::tests::test_roundtrip;
- use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol};
- use crate::schema::types::{ColumnDescriptor, SchemaDescriptor};
+ use crate::parquet_thrift::{
+ read_thrift_vec, ElementType, ReadThrift, ThriftCompactOutputProtocol,
+ ThriftSliceInputProtocol,
+ };
+ use crate::schema::types::{
+ num_nodes, parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor, TypePtr,
+ };
use std::sync::Arc;

// for testing. decode thrift encoded RowGroup
@@ -1547,6 +1563,40 @@ pub(crate) mod tests {
convert_column(cc, column_descr)
}

+ pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result<TypePtr> {
+ let num_nodes = num_nodes(&schema)?;
+ let mut buf = Vec::new();
+ let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
+
+ // kick off writing list
+ writer.write_list_begin(ElementType::Struct, num_nodes)?;
+
+ // write SchemaElements
+ write_schema(&schema, &mut writer)?;
+
+ let mut prot = ThriftSliceInputProtocol::new(&mut buf);
+ let se: Vec<SchemaElement> = read_thrift_vec(&mut prot)?;
+ parquet_schema_from_array(se)
+ }
+
+ pub(crate) fn schema_to_buf(schema: &TypePtr) -> Result<Vec<u8>> {
+ let num_nodes = num_nodes(&schema)?;
+ let mut buf = Vec::new();
+ let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
+
+ // kick off writing list
+ writer.write_list_begin(ElementType::Struct, num_nodes)?;
+
+ // write SchemaElements
+ write_schema(schema, &mut writer)?;
+ Ok(buf)
+ }
+
+ pub(crate) fn buf_to_schema_list<'a>(buf: &'a mut Vec<u8>) -> Result<Vec<SchemaElement<'a>>> {
+ let mut prot = ThriftSliceInputProtocol::new(buf.as_mut_slice());
+ read_thrift_vec(&mut prot)
+ }

#[test]
fn test_bounding_box_roundtrip() {
test_roundtrip(BoundingBox {
diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs
index 3f81353e28dd..9b414430f0eb 100644
--- a/parquet/src/file/page_encoding_stats.rs
+++ b/parquet/src/file/page_encoding_stats.rs
@@ -27,7 +27,7 @@ use crate::parquet_thrift::{
};
use crate::thrift_struct;

-// TODO: This should probably all be moved to thrift_gen
+// TODO(ets): This should probably all be moved to thrift_gen
thrift_struct!(
/// PageEncodingStats for a column chunk and data page.
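/// For example, a column chunk written as three PLAIN-encoded v1 data pages
/// would be summarized by a single entry of `page_type: DATA_PAGE`,
/// `encoding: PLAIN`, `count: 3` (illustrative values).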
pub struct PageEncodingStats { @@ -36,47 +36,3 @@ pub struct PageEncodingStats { 3: required i32 count; } ); - -/// Converts Thrift definition into `PageEncodingStats`. -pub fn try_from_thrift( - thrift_encoding_stats: &crate::format::PageEncodingStats, -) -> Result { - let page_type = PageType::try_from(thrift_encoding_stats.page_type)?; - let encoding = Encoding::try_from(thrift_encoding_stats.encoding)?; - let count = thrift_encoding_stats.count; - - Ok(PageEncodingStats { - page_type, - encoding, - count, - }) -} - -/// Converts `PageEncodingStats` into Thrift definition. -pub fn to_thrift(encoding_stats: &PageEncodingStats) -> crate::format::PageEncodingStats { - let page_type = crate::format::PageType::from(encoding_stats.page_type); - let encoding = crate::format::Encoding::from(encoding_stats.encoding); - let count = encoding_stats.count; - - crate::format::PageEncodingStats { - page_type, - encoding, - count, - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_page_encoding_stats_from_thrift() { - let stats = PageEncodingStats { - page_type: PageType::DATA_PAGE, - encoding: Encoding::PLAIN, - count: 1, - }; - - assert_eq!(try_from_thrift(&to_thrift(&stats)).unwrap(), stats); - } -} diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 30b58ce0acb3..6f04f39c98d5 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -44,26 +44,6 @@ pub struct PageLocation { } ); -impl From<&crate::format::PageLocation> for PageLocation { - fn from(value: &crate::format::PageLocation) -> Self { - Self { - offset: value.offset, - compressed_page_size: value.compressed_page_size, - first_row_index: value.first_row_index, - } - } -} - -impl From<&PageLocation> for crate::format::PageLocation { - fn from(value: &PageLocation) -> Self { - Self { - offset: value.offset, - compressed_page_size: value.compressed_page_size, - first_row_index: value.first_row_index, - } - } -} - thrift_struct!( /// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. @@ -79,18 +59,6 @@ pub struct OffsetIndexMetaData { ); impl OffsetIndexMetaData { - /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. - /// - /// [`OffsetIndex`]: crate::format::OffsetIndex - #[allow(dead_code)] - pub(crate) fn try_new(index: crate::format::OffsetIndex) -> Result { - let page_locations = index.page_locations.iter().map(|loc| loc.into()).collect(); - Ok(Self { - page_locations, - unencoded_byte_array_data_bytes: index.unencoded_byte_array_data_bytes, - }) - } - /// Vector of [`PageLocation`] objects, one per page in the chunk. 
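/// For example (an illustrative sketch, assuming `offset_index` is an
/// `OffsetIndexMetaData` already decoded from a file):
/// ```ignore
/// for loc in offset_index.page_locations() {
///     println!("page at offset {}, {} bytes", loc.offset, loc.compressed_page_size);
/// }
/// ```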
pub fn page_locations(&self) -> &Vec<PageLocation> {
&self.page_locations
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index dca0f8417957..306a70a09f79 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -1030,7 +1030,8 @@ impl SchemaDescriptor {
pub fn new(tp: TypePtr) -> Self {
const INIT_SCHEMA_DEPTH: usize = 16;
assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
- let n_leaves = num_leaves(&tp);
+ // unwrap should be safe since we just asserted tp is a group
+ let n_leaves = num_leaves(&tp).unwrap();
let mut leaves = Vec::with_capacity(n_leaves);
let mut leaf_to_base = Vec::with_capacity(n_leaves);
let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH);
@@ -1115,12 +1116,15 @@ impl SchemaDescriptor {
}

// walk tree and count nodes
-pub(crate) fn num_nodes(tp: &TypePtr) -> usize {
+pub(crate) fn num_nodes(tp: &TypePtr) -> Result<usize> {
+ if !tp.is_group() {
+ return Err(general_err!("Root schema must be Group type"));
+ }
let mut n_nodes = 1usize; // count root
for f in tp.get_fields().iter() {
count_nodes(f, &mut n_nodes);
}
- n_nodes
+ Ok(n_nodes)
}

pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
@@ -1133,12 +1137,15 @@ pub(crate) fn count_nodes(tp: &TypePtr, n_nodes: &mut usize) {
}

// do a quick walk of the tree to get proper sizing for SchemaDescriptor arrays
-fn num_leaves(tp: &TypePtr) -> usize {
+fn num_leaves(tp: &TypePtr) -> Result<usize> {
+ if !tp.is_group() {
+ return Err(general_err!("Root schema must be Group type"));
+ }
let mut n_leaves = 0usize;
for f in tp.get_fields().iter() {
count_leaves(f, &mut n_leaves);
}
- n_leaves
+ Ok(n_leaves)
}

fn count_leaves(tp: &TypePtr, n_leaves: &mut usize) {
@@ -1204,29 +1211,6 @@ fn build_tree<'a>(
}
}

-/// Method to convert from Thrift.
-pub fn from_thrift(elements: &[crate::format::SchemaElement]) -> Result<TypePtr> {
- let mut index = 0;
- let mut schema_nodes = Vec::new();
- while index < elements.len() {
- let t = from_thrift_helper(elements, index)?;
- index = t.0;
- schema_nodes.push(t.1);
- }
- if schema_nodes.len() != 1 {
- return Err(general_err!(
- "Expected exactly one root node, but found {}",
- schema_nodes.len()
- ));
- }
-
- if !schema_nodes[0].is_group() {
- return Err(general_err!("Expected root node to be a group type"));
- }
-
- Ok(schema_nodes.remove(0))
-}
-
/// Checks if the logical type is valid.
fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type {
@@ -1239,215 +1223,6 @@ fn check_logical_type(logical_type: &Option<LogicalType>) -> Result<()> {
Ok(())
}

-/// Constructs a new Type from the `elements`, starting at index `index`.
-/// The first result is the starting index for the next Type after this one. If it is
-/// equal to `elements.len()`, then this Type is the last one.
-/// The second result is the result Type.
-fn from_thrift_helper(
- elements: &[crate::format::SchemaElement],
- index: usize,
-) -> Result<(usize, TypePtr)> {
- // Whether or not the current node is root (message type).
- // There is only one message type node in the schema tree.
- let is_root_node = index == 0; - - if index >= elements.len() { - return Err(general_err!( - "Index out of bound, index = {}, len = {}", - index, - elements.len() - )); - } - let element = &elements[index]; - - // Check for empty schema - if let (true, None | Some(0)) = (is_root_node, element.num_children) { - let builder = Type::group_type_builder(&element.name); - return Ok((index + 1, Arc::new(builder.build().unwrap()))); - } - - let converted_type = ConvertedType::try_from(element.converted_type)?; - // LogicalType is only present in v2 Parquet files. ConvertedType is always - // populated, regardless of the version of the file (v1 or v2). - let logical_type = element - .logical_type - .as_ref() - .map(|value| LogicalType::from(value.clone())); - - check_logical_type(&logical_type)?; - - let field_id = elements[index].field_id; - match elements[index].num_children { - // From parquet-format: - // The children count is used to construct the nested relationship. - // This field is not set when the element is a primitive type - // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we - // have to handle this case too. - None | Some(0) => { - // primitive type - if elements[index].repetition_type.is_none() { - return Err(general_err!( - "Repetition level must be defined for a primitive type" - )); - } - let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?; - if let Some(type_) = elements[index].type_ { - let physical_type = PhysicalType::try_from(type_)?; - let length = elements[index].type_length.unwrap_or(-1); - let scale = elements[index].scale.unwrap_or(-1); - let precision = elements[index].precision.unwrap_or(-1); - let name = &elements[index].name; - let builder = Type::primitive_type_builder(name, physical_type) - .with_repetition(repetition) - .with_converted_type(converted_type) - .with_logical_type(logical_type) - .with_length(length) - .with_precision(precision) - .with_scale(scale) - .with_id(field_id); - Ok((index + 1, Arc::new(builder.build()?))) - } else { - let mut builder = Type::group_type_builder(&elements[index].name) - .with_converted_type(converted_type) - .with_logical_type(logical_type) - .with_id(field_id); - if !is_root_node { - // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or - // REPEATED for root node. - // - // We only set repetition for group types that are not top-level message - // type. According to parquet-format: - // Root of the schema does not have a repetition_type. - // All other types must have one. - builder = builder.with_repetition(repetition); - } - Ok((index + 1, Arc::new(builder.build().unwrap()))) - } - } - Some(n) => { - let repetition = elements[index] - .repetition_type - .map(Repetition::try_from) - .transpose()?; - - let mut fields = Vec::with_capacity(n as usize); - let mut next_index = index + 1; - for _ in 0..n { - let child_result = from_thrift_helper(elements, next_index)?; - next_index = child_result.0; - fields.push(child_result.1); - } - - let mut builder = Type::group_type_builder(&elements[index].name) - .with_converted_type(converted_type) - .with_logical_type(logical_type) - .with_fields(fields) - .with_id(field_id); - if let Some(rep) = repetition { - // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or - // REPEATED for root node. - // - // We only set repetition for group types that are not top-level message - // type. According to parquet-format: - // Root of the schema does not have a repetition_type. 
- // All other types must have one. - if !is_root_node { - builder = builder.with_repetition(rep); - } - } - Ok((next_index, Arc::new(builder.build().unwrap()))) - } - } -} - -/// Method to convert to Thrift. -pub fn to_thrift(schema: &Type) -> Result> { - if !schema.is_group() { - return Err(general_err!("Root schema must be Group type")); - } - let mut elements: Vec = Vec::new(); - to_thrift_helper(schema, &mut elements); - Ok(elements) -} - -/// Constructs list of `SchemaElement` from the schema using depth-first traversal. -/// Here we assume that schema is always valid and starts with group type. -fn to_thrift_helper(schema: &Type, elements: &mut Vec) { - match *schema { - Type::PrimitiveType { - ref basic_info, - physical_type, - type_length, - scale, - precision, - } => { - let element = crate::format::SchemaElement { - type_: Some(physical_type.into()), - type_length: if type_length >= 0 { - Some(type_length) - } else { - None - }, - repetition_type: Some(basic_info.repetition().into()), - name: basic_info.name().to_owned(), - num_children: None, - converted_type: basic_info.converted_type().into(), - scale: if scale >= 0 { Some(scale) } else { None }, - precision: if precision >= 0 { - Some(precision) - } else { - None - }, - field_id: if basic_info.has_id() { - Some(basic_info.id()) - } else { - None - }, - logical_type: basic_info.logical_type().map(|value| value.into()), - }; - - elements.push(element); - } - Type::GroupType { - ref basic_info, - ref fields, - } => { - let repetition = if basic_info.has_repetition() { - Some(basic_info.repetition().into()) - } else { - None - }; - - let element = crate::format::SchemaElement { - type_: None, - type_length: None, - repetition_type: repetition, - name: basic_info.name().to_owned(), - num_children: Some(fields.len() as i32), - converted_type: basic_info.converted_type().into(), - scale: None, - precision: None, - field_id: if basic_info.has_id() { - Some(basic_info.id()) - } else { - None - }, - logical_type: basic_info.logical_type().map(|value| value.into()), - }; - - elements.push(element); - - // Add child elements for a group - for field in fields { - to_thrift_helper(field, elements); - } - } - } -} - -// This is a copy of `from_thrift` above, but rather than `format::SchemaElement` it takes -// the `file::metadata::thrift_gen::SchemaElement<'a>`. - // convert thrift decoded array of `SchemaElement` into this crate's representation of // parquet types. this function consumes `elements`. 
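// As an illustration (field values elided, not taken from a real file), the
// flat list
//   [ SchemaElement { name: "schema", num_children: Some(2), .. },
//     SchemaElement { name: "a", .. },
//     SchemaElement { name: "b", .. } ]
// rebuilds the tree `message schema { a; b }`: the root consumes the next two
// elements as its children, and each primitive consumes exactly one element.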
pub(crate) fn parquet_schema_from_array<'a>(elements: Vec<SchemaElement<'a>>) -> Result<TypePtr> {
@@ -1594,7 +1369,10 @@ fn schema_from_array_helper<'a>(
mod tests {
use super::*;

- use crate::schema::parser::parse_message_type;
+ use crate::{
+ file::metadata::thrift_gen::tests::{buf_to_schema_list, roundtrip_schema, schema_to_buf},
+ schema::parser::parse_message_type,
+ };

// TODO: add tests for v2 types

@@ -2393,7 +2171,8 @@ mod tests {
let schema = Type::primitive_type_builder("col", PhysicalType::INT32)
.build()
.unwrap();
- let thrift_schema = to_thrift(&schema);
+ let schema = Arc::new(schema);
+ let thrift_schema = schema_to_buf(&schema);
assert!(thrift_schema.is_err());
if let Err(e) = thrift_schema {
assert_eq!(
@@ -2453,8 +2232,7 @@ mod tests {
}
";
let expected_schema = parse_message_type(message_type).unwrap();
- let thrift_schema = to_thrift(&expected_schema).unwrap();
- let result_schema = from_thrift(&thrift_schema).unwrap();
+ let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
assert_eq!(result_schema, Arc::new(expected_schema));
}

@@ -2469,8 +2247,7 @@
}
";
let expected_schema = parse_message_type(message_type).unwrap();
- let thrift_schema = to_thrift(&expected_schema).unwrap();
- let result_schema = from_thrift(&thrift_schema).unwrap();
+ let result_schema = roundtrip_schema(Arc::new(expected_schema.clone())).unwrap();
assert_eq!(result_schema, Arc::new(expected_schema));
}

@@ -2490,8 +2267,10 @@
}
";

- let expected_schema = parse_message_type(message_type).unwrap();
- let mut thrift_schema = to_thrift(&expected_schema).unwrap();
+ let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
+ let mut buf = schema_to_buf(&expected_schema).unwrap();
+ let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
+
// Change all of None to Some(0)
for elem in &mut thrift_schema[..]
{
if elem.num_children.is_none() {
@@ -2499,8 +2278,8 @@
}
}

- let result_schema = from_thrift(&thrift_schema).unwrap();
- assert_eq!(result_schema, Arc::new(expected_schema));
+ let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
+ assert_eq!(result_schema, expected_schema);
}

// Sometimes parquet-cpp sets repetition level for the root node, which is against
@@ -2515,23 +2294,25 @@
}
";

- let expected_schema = parse_message_type(message_type).unwrap();
- let mut thrift_schema = to_thrift(&expected_schema).unwrap();
+ let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
+ let mut buf = schema_to_buf(&expected_schema).unwrap();
+ let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into());

- let result_schema = from_thrift(&thrift_schema).unwrap();
- assert_eq!(result_schema, Arc::new(expected_schema));
+ let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
+ assert_eq!(result_schema, expected_schema);
}

#[test]
fn test_schema_from_thrift_group_has_no_child() {
let message_type = "message schema {}";

- let expected_schema = parse_message_type(message_type).unwrap();
- let mut thrift_schema = to_thrift(&expected_schema).unwrap();
+ let expected_schema = Arc::new(parse_message_type(message_type).unwrap());
+ let mut buf = schema_to_buf(&expected_schema).unwrap();
+ let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap();
thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into());

- let result_schema = from_thrift(&thrift_schema).unwrap();
- assert_eq!(result_schema, Arc::new(expected_schema));
+ let result_schema = parquet_schema_from_array(thrift_schema).unwrap();
+ assert_eq!(result_schema, expected_schema);
}
}
diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs
index 051a61de5075..37137b888dfc 100644
--- a/parquet/tests/arrow_reader/io/mod.rs
+++ b/parquet/tests/arrow_reader/io/mod.rs
@@ -47,6 +47,7 @@ use parquet::arrow::arrow_reader::{
use parquet::arrow::{ArrowWriter, ProjectionMask};
use parquet::data_type::AsBytes;
use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex};
+use parquet::file::page_index::offset_index::PageLocation;
use parquet::file::properties::WriterProperties;
use parquet::file::FOOTER_SIZE;
use parquet::schema::types::SchemaDescriptor;
@@ -256,7 +257,7 @@ struct TestColumnChunk {
dictionary_page_location: Option,

/// The location of the data pages in the file
- page_locations: Vec<parquet::format::PageLocation>,
+ page_locations: Vec<PageLocation>,
}

/// Information about the pages in a single row group
@@ -294,16 +295,11 @@ impl TestRowGroups {
let start_offset = start_offset as usize;
let end_offset = start_offset + length as usize;

- let page_locations = page_locations
- .iter()
- .map(parquet::format::PageLocation::from)
- .collect();
-
TestColumnChunk {
name: column_name.clone(),
location: start_offset..end_offset,
dictionary_page_location,
- page_locations,
+ page_locations: page_locations.clone(),
}
})
.map(|test_column_chunk| {

From 39a9169ca959a1190c5d110ca17996c5b9fcb6f9 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 8 Sep 2025 13:51:40 -0700
Subject: [PATCH 074/126] remove format references from docs

---
parquet/src/file/metadata/mod.rs | 13 +++++--------
parquet/src/file/metadata/writer.rs | 4 ++--
parquet/src/file/page_index/column_index.rs | 2 +-
parquet/src/file/page_index/index_reader.rs | 4 ++--
4 files changed, 10 insertions(+), 13 deletions(-)

diff
--git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index b1ec7f8646b7..cf9f908e0023 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -17,9 +17,7 @@
//! Parquet metadata API
//!
-//! Most users should use these structures to interact with Parquet metadata.
-//! The [crate::format] module contains lower level structures generated from the
-//! Parquet thrift definition.
+//! Users should use these structures to interact with Parquet metadata.
//!
//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
//! file footer.
@@ -67,7 +65,6 @@
//! with a more idiomatic API. Note that, confusingly, some but not all
//! of these structures have the same name as the [`format`] structures.
//!
-//! [`format`]: crate::format
//! [`file::metadata`]: crate::file::metadata
//! [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
//!
@@ -148,7 +145,7 @@ pub(crate) use writer::ThriftMetadataWriter;
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
-/// [`ColumnIndex`]: crate::format::ColumnIndex
+/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;

/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
@@ -161,7 +158,7 @@ pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
-/// [`OffsetIndex`]: crate::format::OffsetIndex
+/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

/// Parsed metadata for a single Parquet file
@@ -1274,7 +1271,7 @@ impl ColumnChunkMetaDataBuilder {
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
-/// [`ColumnIndex`]: crate::format::ColumnIndex
+/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
column_type: Type,
null_pages: Vec<bool>,
@@ -1329,7 +1326,7 @@ impl ColumnIndexBuilder {
/// Append the given page-level histograms to the [`ColumnIndex`] histograms.
/// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
///
- /// [`ColumnIndex`]: crate::format::ColumnIndex
+ /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn append_histograms(
&mut self,
repetition_level_histogram: &Option<LevelHistogram>,
diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs
index 031223959509..4699a20d8538 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -317,7 +317,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
/// 4. Length of encoded `FileMetaData` (4 bytes, little endian)
/// 5.
Parquet Magic Bytes (4 bytes) /// -/// [`FileMetaData`]: crate::format::FileMetaData +/// [`FileMetaData`]: https://github.com/apache/parquet-format/tree/master?tab=readme-ov-file#metadata /// [`ColumnChunkMetaData`]: crate::file::metadata::ColumnChunkMetaData /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md /// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md @@ -543,7 +543,7 @@ impl MetadataObjectWriter { /// Write [`FileMetaData`] in Thrift format, possibly encrypting it if required /// - /// [`FileMetaData`]: crate::format::FileMetaData + /// [`FileMetaData`]: https://github.com/apache/parquet-format/tree/master?tab=readme-ov-file#metadata fn write_file_metadata(&self, file_metadata: &FileMeta, mut sink: impl Write) -> Result<()> { match self.file_encryptor.as_ref() { Some(file_encryptor) if file_encryptor.properties().encrypt_footer() => { diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index a0893cc9eae9..6b5e9eb946a1 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -17,7 +17,7 @@ //! [`ColumnIndexMetaData`] structures holding decoded [`ColumnIndex`] information //! -//! [`ColumnIndex`]: crate::format::ColumnIndex +//! [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md //! use crate::{ diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fbf97ad92cce..fd10b9fe8b3c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -54,7 +54,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Option< /// See [Page Index Documentation] for more details. /// /// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -/// [`ColumnIndex`]: crate::format::ColumnIndex +/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md #[deprecated( since = "55.2.0", note = "Use ParquetMetaDataReader instead; will be removed in 58.0.0" @@ -100,7 +100,7 @@ pub fn read_columns_indexes( /// See [Page Index Documentation] for more details. 
/// /// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -/// [`OffsetIndex`]: crate::format::OffsetIndex +/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md #[deprecated( since = "55.2.0", note = "Use ParquetMetaDataReader instead; will be removed in 58.0.0" From 8de96ce605c3b1a4db5a9de6af156e6e9ecff6e7 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 8 Sep 2025 15:47:18 -0700 Subject: [PATCH 075/126] remove format conversion functions --- parquet/src/basic.rs | 674 +----------------------- parquet/src/bin/parquet-layout.rs | 32 +- parquet/src/bloom_filter/mod.rs | 62 +-- parquet/src/file/metadata/thrift_gen.rs | 4 +- parquet/src/parquet_macros.rs | 39 -- parquet/src/schema/types.rs | 4 +- parquet/src/thrift.rs | 348 ++---------- 7 files changed, 90 insertions(+), 1073 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ab058388413f..5318016c3589 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -37,7 +37,7 @@ use crate::errors::{ParquetError, Result}; // Types from the Thrift definition // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::Type` +// Mirrors thrift enum `Type` thrift_enum!( /// Types supported by Parquet. @@ -59,7 +59,7 @@ enum Type { ); // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::ConvertedType` +// Mirrors thrift enum `ConvertedType` // // Cannot use macros because of added field `None` @@ -219,7 +219,7 @@ impl WriteThriftField for ConvertedType { } // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::TimeUnit` +// Mirrors thrift union `TimeUnit` thrift_union_all_empty!( /// Time unit for `Time` and `Timestamp` logical types. @@ -231,7 +231,7 @@ union TimeUnit { ); // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::LogicalType` +// Mirrors thrift union `LogicalType` // private structs for decoding logical type @@ -584,7 +584,7 @@ impl WriteThriftField for LogicalType { } // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::FieldRepetitionType` +// Mirrors thrift enum `FieldRepetitionType` // // Cannot use macro since the name is changed @@ -604,7 +604,7 @@ enum FieldRepetitionType { pub type Repetition = FieldRepetitionType; // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::Encoding` +// Mirrors thrift enum `Encoding` thrift_enum!( /// Encodings supported by Parquet. @@ -713,7 +713,7 @@ impl FromStr for Encoding { } // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::CompressionCodec` +// Mirrors thrift enum `CompressionCodec` /// Supported block compression algorithms. /// @@ -892,7 +892,7 @@ impl FromStr for Compression { } // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::PageType` +// Mirrors thrift enum `PageType` thrift_enum!( /// Available data pages for Parquet file format. 
@@ -906,7 +906,7 @@ enum PageType { ); // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::BoundaryOrder` +// Mirrors thrift enum `BoundaryOrder` thrift_enum!( /// Enum to annotate whether lists of min/max elements inside ColumnIndex @@ -919,7 +919,7 @@ enum BoundaryOrder { ); // ---------------------------------------------------------------------- -// Mirrors thrift enum `crate::format::EdgeInterpolationAlgorithm` +// Mirrors thrift enum `EdgeInterpolationAlgorithm` thrift_enum!( /// Edge interpolation algorithm for Geography logical type @@ -933,7 +933,7 @@ enum EdgeInterpolationAlgorithm { ); // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::BloomFilterAlgorithm` +// Mirrors thrift union `BloomFilterAlgorithm` thrift_union_all_empty!( /// The algorithm used in Bloom filter. @@ -944,7 +944,7 @@ union BloomFilterAlgorithm { ); // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::BloomFilterHash` +// Mirrors thrift union `BloomFilterHash` thrift_union_all_empty!( /// The hash function used in Bloom filter. This function takes the hash of a column value @@ -956,7 +956,7 @@ union BloomFilterHash { ); // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::BloomFilterCompression` +// Mirrors thrift union `BloomFilterCompression` thrift_union_all_empty!( /// The compression used in the Bloom filter. @@ -966,7 +966,7 @@ union BloomFilterCompression { ); // ---------------------------------------------------------------------- -// Mirrors thrift union `crate::format::ColumnOrder` +// Mirrors thrift union `ColumnOrder` /// Sort order for page and column statistics. 
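/// For example, under the default TYPE_ORDER column order, INT32 min/max
/// statistics compare as SIGNED values while BYTE_ARRAY string statistics
/// compare as UNSIGNED (byte-wise lexicographic); this is an illustrative
/// summary only.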
/// @@ -1187,181 +1187,6 @@ impl fmt::Display for ColumnOrder { } } -// ---------------------------------------------------------------------- -// crate::format::ConvertedType <=> ConvertedType conversion - -impl TryFrom> for ConvertedType { - type Error = ParquetError; - - fn try_from(option: Option) -> Result { - Ok(match option { - None => ConvertedType::NONE, - Some(value) => match value { - crate::format::ConvertedType::UTF8 => ConvertedType::UTF8, - crate::format::ConvertedType::MAP => ConvertedType::MAP, - crate::format::ConvertedType::MAP_KEY_VALUE => ConvertedType::MAP_KEY_VALUE, - crate::format::ConvertedType::LIST => ConvertedType::LIST, - crate::format::ConvertedType::ENUM => ConvertedType::ENUM, - crate::format::ConvertedType::DECIMAL => ConvertedType::DECIMAL, - crate::format::ConvertedType::DATE => ConvertedType::DATE, - crate::format::ConvertedType::TIME_MILLIS => ConvertedType::TIME_MILLIS, - crate::format::ConvertedType::TIME_MICROS => ConvertedType::TIME_MICROS, - crate::format::ConvertedType::TIMESTAMP_MILLIS => ConvertedType::TIMESTAMP_MILLIS, - crate::format::ConvertedType::TIMESTAMP_MICROS => ConvertedType::TIMESTAMP_MICROS, - crate::format::ConvertedType::UINT_8 => ConvertedType::UINT_8, - crate::format::ConvertedType::UINT_16 => ConvertedType::UINT_16, - crate::format::ConvertedType::UINT_32 => ConvertedType::UINT_32, - crate::format::ConvertedType::UINT_64 => ConvertedType::UINT_64, - crate::format::ConvertedType::INT_8 => ConvertedType::INT_8, - crate::format::ConvertedType::INT_16 => ConvertedType::INT_16, - crate::format::ConvertedType::INT_32 => ConvertedType::INT_32, - crate::format::ConvertedType::INT_64 => ConvertedType::INT_64, - crate::format::ConvertedType::JSON => ConvertedType::JSON, - crate::format::ConvertedType::BSON => ConvertedType::BSON, - crate::format::ConvertedType::INTERVAL => ConvertedType::INTERVAL, - _ => { - return Err(general_err!( - "unexpected parquet converted type: {}", - value.0 - )) - } - }, - }) - } -} - -impl From for Option { - fn from(value: ConvertedType) -> Self { - match value { - ConvertedType::NONE => None, - ConvertedType::UTF8 => Some(crate::format::ConvertedType::UTF8), - ConvertedType::MAP => Some(crate::format::ConvertedType::MAP), - ConvertedType::MAP_KEY_VALUE => Some(crate::format::ConvertedType::MAP_KEY_VALUE), - ConvertedType::LIST => Some(crate::format::ConvertedType::LIST), - ConvertedType::ENUM => Some(crate::format::ConvertedType::ENUM), - ConvertedType::DECIMAL => Some(crate::format::ConvertedType::DECIMAL), - ConvertedType::DATE => Some(crate::format::ConvertedType::DATE), - ConvertedType::TIME_MILLIS => Some(crate::format::ConvertedType::TIME_MILLIS), - ConvertedType::TIME_MICROS => Some(crate::format::ConvertedType::TIME_MICROS), - ConvertedType::TIMESTAMP_MILLIS => Some(crate::format::ConvertedType::TIMESTAMP_MILLIS), - ConvertedType::TIMESTAMP_MICROS => Some(crate::format::ConvertedType::TIMESTAMP_MICROS), - ConvertedType::UINT_8 => Some(crate::format::ConvertedType::UINT_8), - ConvertedType::UINT_16 => Some(crate::format::ConvertedType::UINT_16), - ConvertedType::UINT_32 => Some(crate::format::ConvertedType::UINT_32), - ConvertedType::UINT_64 => Some(crate::format::ConvertedType::UINT_64), - ConvertedType::INT_8 => Some(crate::format::ConvertedType::INT_8), - ConvertedType::INT_16 => Some(crate::format::ConvertedType::INT_16), - ConvertedType::INT_32 => Some(crate::format::ConvertedType::INT_32), - ConvertedType::INT_64 => Some(crate::format::ConvertedType::INT_64), - ConvertedType::JSON => 
Some(crate::format::ConvertedType::JSON), - ConvertedType::BSON => Some(crate::format::ConvertedType::BSON), - ConvertedType::INTERVAL => Some(crate::format::ConvertedType::INTERVAL), - } - } -} - -// ---------------------------------------------------------------------- -// crate::format::LogicalType <=> LogicalType conversion - -impl From for LogicalType { - fn from(value: crate::format::LogicalType) -> Self { - match value { - crate::format::LogicalType::STRING(_) => LogicalType::String, - crate::format::LogicalType::MAP(_) => LogicalType::Map, - crate::format::LogicalType::LIST(_) => LogicalType::List, - crate::format::LogicalType::ENUM(_) => LogicalType::Enum, - crate::format::LogicalType::DECIMAL(t) => LogicalType::Decimal { - scale: t.scale, - precision: t.precision, - }, - crate::format::LogicalType::DATE(_) => LogicalType::Date, - crate::format::LogicalType::TIME(t) => LogicalType::Time { - is_adjusted_to_u_t_c: t.is_adjusted_to_u_t_c, - unit: t.unit.into(), - }, - crate::format::LogicalType::TIMESTAMP(t) => LogicalType::Timestamp { - is_adjusted_to_u_t_c: t.is_adjusted_to_u_t_c, - unit: t.unit.into(), - }, - crate::format::LogicalType::INTEGER(t) => LogicalType::Integer { - bit_width: t.bit_width, - is_signed: t.is_signed, - }, - crate::format::LogicalType::UNKNOWN(_) => LogicalType::Unknown, - crate::format::LogicalType::JSON(_) => LogicalType::Json, - crate::format::LogicalType::BSON(_) => LogicalType::Bson, - crate::format::LogicalType::UUID(_) => LogicalType::Uuid, - crate::format::LogicalType::FLOAT16(_) => LogicalType::Float16, - crate::format::LogicalType::VARIANT(vt) => LogicalType::Variant { - specification_version: vt.specification_version, - }, - crate::format::LogicalType::GEOMETRY(gt) => LogicalType::Geometry { crs: gt.crs }, - crate::format::LogicalType::GEOGRAPHY(gt) => LogicalType::Geography { - crs: gt.crs, - algorithm: gt.algorithm.map(|a| a.try_into().unwrap()), - }, - } - } -} - -impl From for crate::format::LogicalType { - fn from(value: LogicalType) -> Self { - match value { - LogicalType::String => crate::format::LogicalType::STRING(Default::default()), - LogicalType::Map => crate::format::LogicalType::MAP(Default::default()), - LogicalType::List => crate::format::LogicalType::LIST(Default::default()), - LogicalType::Enum => crate::format::LogicalType::ENUM(Default::default()), - LogicalType::Decimal { scale, precision } => { - crate::format::LogicalType::DECIMAL(crate::format::DecimalType { scale, precision }) - } - LogicalType::Date => crate::format::LogicalType::DATE(Default::default()), - LogicalType::Time { - is_adjusted_to_u_t_c, - unit, - } => crate::format::LogicalType::TIME(crate::format::TimeType { - is_adjusted_to_u_t_c, - unit: unit.into(), - }), - LogicalType::Timestamp { - is_adjusted_to_u_t_c, - unit, - } => crate::format::LogicalType::TIMESTAMP(crate::format::TimestampType { - is_adjusted_to_u_t_c, - unit: unit.into(), - }), - LogicalType::Integer { - bit_width, - is_signed, - } => crate::format::LogicalType::INTEGER(crate::format::IntType { - bit_width, - is_signed, - }), - LogicalType::Unknown => crate::format::LogicalType::UNKNOWN(Default::default()), - LogicalType::Json => crate::format::LogicalType::JSON(Default::default()), - LogicalType::Bson => crate::format::LogicalType::BSON(Default::default()), - LogicalType::Uuid => crate::format::LogicalType::UUID(Default::default()), - LogicalType::Float16 => crate::format::LogicalType::FLOAT16(Default::default()), - LogicalType::Variant { - specification_version, - } => 
crate::format::LogicalType::VARIANT(crate::format::VariantType { - specification_version, - }), - LogicalType::Geometry { crs } => { - crate::format::LogicalType::GEOMETRY(crate::format::GeometryType { crs }) - } - LogicalType::Geography { crs, algorithm } => { - crate::format::LogicalType::GEOGRAPHY(crate::format::GeographyType { - crs, - algorithm: algorithm.map(|a| a.into()), - }) - } - LogicalType::_Unknown { .. } => { - panic!("Trying to convert unknown LogicalType to thrift"); - } - } - } -} - // ---------------------------------------------------------------------- // LogicalType <=> ConvertedType conversion @@ -1589,57 +1414,6 @@ mod tests { ); } - #[test] - fn test_from_type() { - assert_eq!( - Type::try_from(crate::format::Type::BOOLEAN).unwrap(), - Type::BOOLEAN - ); - assert_eq!( - Type::try_from(crate::format::Type::INT32).unwrap(), - Type::INT32 - ); - assert_eq!( - Type::try_from(crate::format::Type::INT64).unwrap(), - Type::INT64 - ); - assert_eq!( - Type::try_from(crate::format::Type::INT96).unwrap(), - Type::INT96 - ); - assert_eq!( - Type::try_from(crate::format::Type::FLOAT).unwrap(), - Type::FLOAT - ); - assert_eq!( - Type::try_from(crate::format::Type::DOUBLE).unwrap(), - Type::DOUBLE - ); - assert_eq!( - Type::try_from(crate::format::Type::BYTE_ARRAY).unwrap(), - Type::BYTE_ARRAY - ); - assert_eq!( - Type::try_from(crate::format::Type::FIXED_LEN_BYTE_ARRAY).unwrap(), - Type::FIXED_LEN_BYTE_ARRAY - ); - } - - #[test] - fn test_into_type() { - assert_eq!(crate::format::Type::BOOLEAN, Type::BOOLEAN.into()); - assert_eq!(crate::format::Type::INT32, Type::INT32.into()); - assert_eq!(crate::format::Type::INT64, Type::INT64.into()); - assert_eq!(crate::format::Type::INT96, Type::INT96.into()); - assert_eq!(crate::format::Type::FLOAT, Type::FLOAT.into()); - assert_eq!(crate::format::Type::DOUBLE, Type::DOUBLE.into()); - assert_eq!(crate::format::Type::BYTE_ARRAY, Type::BYTE_ARRAY.into()); - assert_eq!( - crate::format::Type::FIXED_LEN_BYTE_ARRAY, - Type::FIXED_LEN_BYTE_ARRAY.into() - ); - } - #[test] fn test_from_string_into_type() { assert_eq!( @@ -1741,205 +1515,6 @@ mod tests { assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL") } - #[test] - fn test_from_converted_type() { - let parquet_conv_none: Option = None; - assert_eq!( - ConvertedType::try_from(parquet_conv_none).unwrap(), - ConvertedType::NONE - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::UTF8)).unwrap(), - ConvertedType::UTF8 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::MAP)).unwrap(), - ConvertedType::MAP - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::MAP_KEY_VALUE)).unwrap(), - ConvertedType::MAP_KEY_VALUE - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::LIST)).unwrap(), - ConvertedType::LIST - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::ENUM)).unwrap(), - ConvertedType::ENUM - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::DECIMAL)).unwrap(), - ConvertedType::DECIMAL - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::DATE)).unwrap(), - ConvertedType::DATE - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::TIME_MILLIS)).unwrap(), - ConvertedType::TIME_MILLIS - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::TIME_MICROS)).unwrap(), - ConvertedType::TIME_MICROS - ); - assert_eq!( - 
ConvertedType::try_from(Some(crate::format::ConvertedType::TIMESTAMP_MILLIS)).unwrap(), - ConvertedType::TIMESTAMP_MILLIS - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::TIMESTAMP_MICROS)).unwrap(), - ConvertedType::TIMESTAMP_MICROS - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::UINT_8)).unwrap(), - ConvertedType::UINT_8 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::UINT_16)).unwrap(), - ConvertedType::UINT_16 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::UINT_32)).unwrap(), - ConvertedType::UINT_32 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::UINT_64)).unwrap(), - ConvertedType::UINT_64 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::INT_8)).unwrap(), - ConvertedType::INT_8 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::INT_16)).unwrap(), - ConvertedType::INT_16 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::INT_32)).unwrap(), - ConvertedType::INT_32 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::INT_64)).unwrap(), - ConvertedType::INT_64 - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::JSON)).unwrap(), - ConvertedType::JSON - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::BSON)).unwrap(), - ConvertedType::BSON - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::INTERVAL)).unwrap(), - ConvertedType::INTERVAL - ); - assert_eq!( - ConvertedType::try_from(Some(crate::format::ConvertedType::DECIMAL)).unwrap(), - ConvertedType::DECIMAL - ) - } - - #[test] - fn test_into_converted_type() { - let converted_type: Option = None; - assert_eq!(converted_type, ConvertedType::NONE.into()); - assert_eq!( - Some(crate::format::ConvertedType::UTF8), - ConvertedType::UTF8.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::MAP), - ConvertedType::MAP.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::MAP_KEY_VALUE), - ConvertedType::MAP_KEY_VALUE.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::LIST), - ConvertedType::LIST.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::ENUM), - ConvertedType::ENUM.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::DECIMAL), - ConvertedType::DECIMAL.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::DATE), - ConvertedType::DATE.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::TIME_MILLIS), - ConvertedType::TIME_MILLIS.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::TIME_MICROS), - ConvertedType::TIME_MICROS.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::TIMESTAMP_MILLIS), - ConvertedType::TIMESTAMP_MILLIS.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::TIMESTAMP_MICROS), - ConvertedType::TIMESTAMP_MICROS.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::UINT_8), - ConvertedType::UINT_8.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::UINT_16), - ConvertedType::UINT_16.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::UINT_32), - ConvertedType::UINT_32.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::UINT_64), - ConvertedType::UINT_64.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::INT_8), - ConvertedType::INT_8.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::INT_16), - 
ConvertedType::INT_16.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::INT_32), - ConvertedType::INT_32.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::INT_64), - ConvertedType::INT_64.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::JSON), - ConvertedType::JSON.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::BSON), - ConvertedType::BSON.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::INTERVAL), - ConvertedType::INTERVAL.into() - ); - assert_eq!( - Some(crate::format::ConvertedType::DECIMAL), - ConvertedType::DECIMAL.into() - ) - } - #[test] fn test_from_string_into_converted_type() { assert_eq!( @@ -2353,38 +1928,6 @@ mod tests { assert_eq!(Repetition::REPEATED.to_string(), "REPEATED"); } - #[test] - fn test_from_repetition() { - assert_eq!( - Repetition::try_from(crate::format::FieldRepetitionType::REQUIRED).unwrap(), - Repetition::REQUIRED - ); - assert_eq!( - Repetition::try_from(crate::format::FieldRepetitionType::OPTIONAL).unwrap(), - Repetition::OPTIONAL - ); - assert_eq!( - Repetition::try_from(crate::format::FieldRepetitionType::REPEATED).unwrap(), - Repetition::REPEATED - ); - } - - #[test] - fn test_into_repetition() { - assert_eq!( - crate::format::FieldRepetitionType::REQUIRED, - Repetition::REQUIRED.into() - ); - assert_eq!( - crate::format::FieldRepetitionType::OPTIONAL, - Repetition::OPTIONAL.into() - ); - assert_eq!( - crate::format::FieldRepetitionType::REPEATED, - Repetition::REPEATED.into() - ); - } - #[test] fn test_from_string_into_repetition() { assert_eq!( @@ -2428,64 +1971,6 @@ mod tests { assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY"); } - #[test] - fn test_from_encoding() { - assert_eq!( - Encoding::try_from(crate::format::Encoding::PLAIN).unwrap(), - Encoding::PLAIN - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::PLAIN_DICTIONARY).unwrap(), - Encoding::PLAIN_DICTIONARY - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::RLE).unwrap(), - Encoding::RLE - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::BIT_PACKED).unwrap(), - Encoding::BIT_PACKED - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::DELTA_BINARY_PACKED).unwrap(), - Encoding::DELTA_BINARY_PACKED - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::DELTA_LENGTH_BYTE_ARRAY).unwrap(), - Encoding::DELTA_LENGTH_BYTE_ARRAY - ); - assert_eq!( - Encoding::try_from(crate::format::Encoding::DELTA_BYTE_ARRAY).unwrap(), - Encoding::DELTA_BYTE_ARRAY - ); - } - - #[test] - fn test_into_encoding() { - assert_eq!(crate::format::Encoding::PLAIN, Encoding::PLAIN.into()); - assert_eq!( - crate::format::Encoding::PLAIN_DICTIONARY, - Encoding::PLAIN_DICTIONARY.into() - ); - assert_eq!(crate::format::Encoding::RLE, Encoding::RLE.into()); - assert_eq!( - crate::format::Encoding::BIT_PACKED, - Encoding::BIT_PACKED.into() - ); - assert_eq!( - crate::format::Encoding::DELTA_BINARY_PACKED, - Encoding::DELTA_BINARY_PACKED.into() - ); - assert_eq!( - crate::format::Encoding::DELTA_LENGTH_BYTE_ARRAY, - Encoding::DELTA_LENGTH_BYTE_ARRAY.into() - ); - assert_eq!( - crate::format::Encoding::DELTA_BYTE_ARRAY, - Encoding::DELTA_BYTE_ARRAY.into() - ); - } - #[test] fn test_compression_codec_to_string() { assert_eq!(Compression::UNCOMPRESSED.codec_to_string(), "UNCOMPRESSED"); @@ -2587,46 +2072,6 @@ mod tests { assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2"); } - #[test] - fn test_from_page_type() { - assert_eq!( - 
PageType::try_from(crate::format::PageType::DATA_PAGE).unwrap(), - PageType::DATA_PAGE - ); - assert_eq!( - PageType::try_from(crate::format::PageType::INDEX_PAGE).unwrap(), - PageType::INDEX_PAGE - ); - assert_eq!( - PageType::try_from(crate::format::PageType::DICTIONARY_PAGE).unwrap(), - PageType::DICTIONARY_PAGE - ); - assert_eq!( - PageType::try_from(crate::format::PageType::DATA_PAGE_V2).unwrap(), - PageType::DATA_PAGE_V2 - ); - } - - #[test] - fn test_into_page_type() { - assert_eq!( - crate::format::PageType::DATA_PAGE, - PageType::DATA_PAGE.into() - ); - assert_eq!( - crate::format::PageType::INDEX_PAGE, - PageType::INDEX_PAGE.into() - ); - assert_eq!( - crate::format::PageType::DICTIONARY_PAGE, - PageType::DICTIONARY_PAGE.into() - ); - assert_eq!( - crate::format::PageType::DATA_PAGE_V2, - PageType::DATA_PAGE_V2.into() - ); - } - #[test] fn test_display_sort_order() { assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED"); @@ -2945,38 +2390,6 @@ mod tests { assert_eq!(BoundaryOrder::UNORDERED.to_string(), "UNORDERED"); } - #[test] - fn test_from_boundary_order() { - assert_eq!( - BoundaryOrder::try_from(crate::format::BoundaryOrder::ASCENDING).unwrap(), - BoundaryOrder::ASCENDING - ); - assert_eq!( - BoundaryOrder::try_from(crate::format::BoundaryOrder::DESCENDING).unwrap(), - BoundaryOrder::DESCENDING - ); - assert_eq!( - BoundaryOrder::try_from(crate::format::BoundaryOrder::UNORDERED).unwrap(), - BoundaryOrder::UNORDERED - ); - } - - #[test] - fn test_into_boundary_order() { - assert_eq!( - crate::format::BoundaryOrder::ASCENDING, - BoundaryOrder::ASCENDING.into() - ); - assert_eq!( - crate::format::BoundaryOrder::DESCENDING, - BoundaryOrder::DESCENDING.into() - ); - assert_eq!( - crate::format::BoundaryOrder::UNORDERED, - BoundaryOrder::UNORDERED.into() - ); - } - #[test] fn test_display_edge_algo() { assert_eq!( @@ -2988,63 +2401,4 @@ mod tests { assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER"); assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY"); } - - #[test] - fn test_from_edge_algo() { - assert_eq!( - EdgeInterpolationAlgorithm::try_from( - crate::format::EdgeInterpolationAlgorithm::SPHERICAL - ) - .unwrap(), - EdgeInterpolationAlgorithm::SPHERICAL - ); - assert_eq!( - EdgeInterpolationAlgorithm::try_from( - crate::format::EdgeInterpolationAlgorithm::VINCENTY - ) - .unwrap(), - EdgeInterpolationAlgorithm::VINCENTY - ); - assert_eq!( - EdgeInterpolationAlgorithm::try_from(crate::format::EdgeInterpolationAlgorithm::THOMAS) - .unwrap(), - EdgeInterpolationAlgorithm::THOMAS - ); - assert_eq!( - EdgeInterpolationAlgorithm::try_from( - crate::format::EdgeInterpolationAlgorithm::ANDOYER - ) - .unwrap(), - EdgeInterpolationAlgorithm::ANDOYER - ); - assert_eq!( - EdgeInterpolationAlgorithm::try_from(crate::format::EdgeInterpolationAlgorithm::KARNEY) - .unwrap(), - EdgeInterpolationAlgorithm::KARNEY - ); - } - - #[test] - fn test_into_edge_algo() { - assert_eq!( - crate::format::EdgeInterpolationAlgorithm::SPHERICAL, - EdgeInterpolationAlgorithm::SPHERICAL.into() - ); - assert_eq!( - crate::format::EdgeInterpolationAlgorithm::VINCENTY, - EdgeInterpolationAlgorithm::VINCENTY.into() - ); - assert_eq!( - crate::format::EdgeInterpolationAlgorithm::THOMAS, - EdgeInterpolationAlgorithm::THOMAS.into() - ); - assert_eq!( - crate::format::EdgeInterpolationAlgorithm::ANDOYER, - EdgeInterpolationAlgorithm::ANDOYER.into() - ); - assert_eq!( - crate::format::EdgeInterpolationAlgorithm::KARNEY, - EdgeInterpolationAlgorithm::KARNEY.into() - ); - } } diff --git 
a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 46a231a7d02b..6f589fab66ed 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -41,7 +41,7 @@ use parquet::file::metadata::ParquetMetaDataReader; use serde::Serialize; use thrift::protocol::TCompactInputProtocol; -use parquet::basic::{Compression, Encoding}; +use parquet::basic::Compression; use parquet::errors::Result; use parquet::file::reader::ChunkReader; use parquet::format::PageHeader; @@ -105,7 +105,7 @@ fn do_layout(reader: &C) -> Result { if let Some(dictionary) = header.dictionary_page_header { pages.push(Page { compression, - encoding: encoding(dictionary.encoding), + encoding: encoding(dictionary.encoding.0), page_type: "dictionary", offset: start, compressed_bytes: header.compressed_page_size, @@ -116,7 +116,7 @@ fn do_layout(reader: &C) -> Result { } else if let Some(data_page) = header.data_page_header { pages.push(Page { compression, - encoding: encoding(data_page.encoding), + encoding: encoding(data_page.encoding.0), page_type: "data_page_v1", offset: start, compressed_bytes: header.compressed_page_size, @@ -129,7 +129,7 @@ fn do_layout(reader: &C) -> Result { pages.push(Page { compression: compression.filter(|_| is_compressed), - encoding: encoding(data_page.encoding), + encoding: encoding(data_page.encoding.0), page_type: "data_page_v2", offset: start, compressed_bytes: header.compressed_page_size, @@ -196,19 +196,19 @@ fn compression(compression: Compression) -> Option<&'static str> { } /// Returns a string representation for a given encoding -fn encoding(encoding: parquet::format::Encoding) -> &'static str { - match Encoding::try_from(encoding) { - Ok(Encoding::PLAIN) => "plain", - Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary", - Ok(Encoding::RLE) => "rle", +fn encoding(encoding: i32) -> &'static str { + match encoding { + 0 => "plain", + 2 => "plain_dictionary", + 3 => "rle", #[allow(deprecated)] - Ok(Encoding::BIT_PACKED) => "bit_packed", - Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed", - Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array", - Ok(Encoding::DELTA_BYTE_ARRAY) => "delta_byte_array", - Ok(Encoding::RLE_DICTIONARY) => "rle_dictionary", - Ok(Encoding::BYTE_STREAM_SPLIT) => "byte_stream_split", - Err(_) => "unknown", + 4 => "bit_packed", + 5 => "delta_binary_packed", + 6 => "delta_length_byte_array", + 7 => "delta_byte_array", + 8 => "rle_dictionary", + 9 => "byte_stream_split", + _ => "unknown", } } diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index e1cc5fd68cd7..290a887b2960 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -74,13 +74,16 @@ use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash}; use crate::data_type::AsBytes; -use crate::errors::ParquetError; +use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::reader::ChunkReader; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; +use crate::parquet_thrift::{ + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + ThriftSliceInputProtocol, WriteThrift, WriteThriftField, +}; +use crate::thrift_struct; use bytes::Bytes; use std::io::Write; -use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; use twox_hash::XxHash64; /// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). 
@@ -95,42 +98,21 @@ const SALT: [u32; 8] = [ 0x5c6bfb31_u32, ]; +thrift_struct!( /// Bloom filter header is stored at beginning of Bloom filter data of each column /// and followed by its bitset. /// -#[derive(Clone, Debug, Eq, PartialEq)] pub struct BloomFilterHeader { - /// The size of bitset in bytes * - pub num_bytes: i32, - /// The algorithm for setting bits. * - pub algorithm: BloomFilterAlgorithm, - /// The hash function used for Bloom filter. * - pub hash: BloomFilterHash, - /// The compression used in the Bloom filter * - pub compression: BloomFilterCompression, -} - -impl From for BloomFilterHeader { - fn from(value: crate::format::BloomFilterHeader) -> Self { - Self { - num_bytes: value.num_bytes, - algorithm: value.algorithm.into(), - hash: value.hash.into(), - compression: value.compression.into(), - } - } -} - -impl From for crate::format::BloomFilterHeader { - fn from(value: BloomFilterHeader) -> Self { - Self { - num_bytes: value.num_bytes, - algorithm: value.algorithm.into(), - hash: value.hash.into(), - compression: value.compression.into(), - } - } + /// The size of bitset in bytes + 1: required i32 num_bytes; + /// The algorithm for setting bits. + 2: required BloomFilterAlgorithm algorithm; + /// The hash function used for Bloom filter + 3: required BloomFilterHash hash; + /// The compression used in the Bloom filter + 4: required BloomFilterCompression compression; } +); /// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. /// Each word is thought of as an array of bits; each bit is either "set" or "not set". @@ -235,10 +217,10 @@ pub(crate) fn read_bloom_filter_header_and_length( buffer: Bytes, ) -> Result<(BloomFilterHeader, u64), ParquetError> { let total_length = buffer.len(); - let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); - let header = crate::format::BloomFilterHeader::read_from_in_protocol(&mut prot) + let mut prot = ThriftSliceInputProtocol::new(buffer.as_ref()); + let header = BloomFilterHeader::read_thrift(&mut prot) .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?; - Ok((header.into(), (total_length - prot.as_slice().len()) as u64)) + Ok((header, (total_length - prot.as_slice().len()) as u64)) } pub(crate) const BITSET_MIN_LENGTH: usize = 32; @@ -302,12 +284,10 @@ impl Sbbf { /// flush the writer in order to boost performance of bulk writing all blocks. Caller /// must remember to flush the writer. 
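/// For example (an illustrative sketch, assuming `sbbf` is an `Sbbf` and
/// `writer` is any `W: Write`):
/// ```ignore
/// sbbf.write(&mut writer)?;
/// writer.flush()?; // `write` deliberately does not flush
/// ```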
pub(crate) fn write(&self, mut writer: W) -> Result<(), ParquetError> { - let mut protocol = TCompactOutputProtocol::new(&mut writer); - let header: crate::format::BloomFilterHeader = self.header().into(); - header.write_to_out_protocol(&mut protocol).map_err(|e| { + let mut protocol = ThriftCompactOutputProtocol::new(&mut writer); + self.header().write_thrift(&mut protocol).map_err(|e| { ParquetError::General(format!("Could not write bloom filter header: {e}")) })?; - protocol.flush()?; self.write_bitset(&mut writer)?; Ok(()) } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 9fbfcbb0f45d..48aeb2df38dd 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1574,13 +1574,13 @@ pub(crate) mod tests { // write SchemaElements write_schema(&schema, &mut writer)?; - let mut prot = ThriftSliceInputProtocol::new(&mut buf); + let mut prot = ThriftSliceInputProtocol::new(&buf); let se: Vec = read_thrift_vec(&mut prot)?; parquet_schema_from_array(se) } pub(crate) fn schema_to_buf(schema: &TypePtr) -> Result> { - let num_nodes = num_nodes(&schema)?; + let num_nodes = num_nodes(schema)?; let mut buf = Vec::new(); let mut writer = ThriftCompactOutputProtocol::new(&mut buf); diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 358b70a1b7ff..66d3692a25e2 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -65,28 +65,6 @@ macro_rules! thrift_enum { Ok(field_id) } } - - // TODO: remove when we finally get rid of the format module - impl TryFrom for $identifier { - type Error = ParquetError; - - #[allow(deprecated)] - fn try_from(value: crate::format::$identifier) -> Result { - Ok(match value { - $(crate::format::$identifier::$field_name => Self::$field_name,)* - _ => return Err(general_err!("Unexpected parquet {}: {}", stringify!($identifier), value.0)), - }) - } - } - - impl From<$identifier> for crate::format::$identifier { - #[allow(deprecated)] - fn from(value: $identifier) -> Self { - match value { - $($identifier::$field_name => Self::$field_name,)* - } - } - } } } @@ -149,23 +127,6 @@ macro_rules! 
thrift_union_all_empty { Ok(field_id) } } - - // TODO: remove when we finally get rid of the format module - impl From for $identifier { - fn from(value: crate::format::$identifier) -> Self { - match value { - $(crate::format::$identifier::$field_name(_) => Self::$field_name,)* - } - } - } - - impl From<$identifier> for crate::format::$identifier { - fn from(value: $identifier) -> Self { - match value { - $($identifier::$field_name => Self::$field_name(Default::default()),)* - } - } - } } } diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 306a70a09f79..44d9058abf11 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -2297,7 +2297,7 @@ mod tests { let expected_schema = Arc::new(parse_message_type(message_type).unwrap()); let mut buf = schema_to_buf(&expected_schema).unwrap(); let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap(); - thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); + thrift_schema[0].repetition_type = Some(Repetition::REQUIRED); let result_schema = parquet_schema_from_array(thrift_schema).unwrap(); assert_eq!(result_schema, expected_schema); @@ -2310,7 +2310,7 @@ mod tests { let expected_schema = Arc::new(parse_message_type(message_type).unwrap()); let mut buf = schema_to_buf(&expected_schema).unwrap(); let mut thrift_schema = buf_to_schema_list(&mut buf).unwrap(); - thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); + thrift_schema[0].repetition_type = Some(Repetition::REQUIRED); let result_schema = parquet_schema_from_array(thrift_schema).unwrap(); assert_eq!(result_schema, expected_schema); diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index 2492910a3115..ff9e9a39aac1 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -18,10 +18,7 @@ //! Custom thrift definitions pub use thrift::protocol::TCompactOutputProtocol; -use thrift::protocol::{ - TFieldIdentifier, TInputProtocol, TListIdentifier, TMapIdentifier, TMessageIdentifier, - TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, -}; +use thrift::protocol::{TInputProtocol, TOutputProtocol}; /// Reads and writes the struct to Thrift protocols. /// @@ -54,332 +51,57 @@ pub fn bench_page_header_no_stats(bytes: &bytes::Bytes) { crate::file::metadata::thrift_gen::PageHeader::read_thrift_without_stats(&mut prot).unwrap(); } -/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice -/// -/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol -pub(crate) struct TCompactSliceInputProtocol<'a> { - buf: &'a [u8], - // Identifier of the last field deserialized for a struct. - last_read_field_id: i16, - // Stack of the last read field ids (a new entry is added each time a nested struct is read). - read_field_id_stack: Vec, - // Boolean value for a field. - // Saved because boolean fields and their value are encoded in a single byte, - // and reading the field only occurs after the field id is read. 
- pending_read_bool_value: Option, -} - -impl<'a> TCompactSliceInputProtocol<'a> { - pub fn new(buf: &'a [u8]) -> Self { - Self { - buf, - last_read_field_id: 0, - read_field_id_stack: Vec::with_capacity(16), - pending_read_bool_value: None, - } - } - - pub fn as_slice(&self) -> &'a [u8] { - self.buf - } - - fn read_vlq(&mut self) -> thrift::Result { - let mut in_progress = 0; - let mut shift = 0; - loop { - let byte = self.read_byte()?; - in_progress |= ((byte & 0x7F) as u64).wrapping_shl(shift); - shift += 7; - if byte & 0x80 == 0 { - return Ok(in_progress); - } - } - } - - fn read_zig_zag(&mut self) -> thrift::Result { - let val = self.read_vlq()?; - Ok((val >> 1) as i64 ^ -((val & 1) as i64)) - } - - fn read_list_set_begin(&mut self) -> thrift::Result<(TType, i32)> { - let header = self.read_byte()?; - let element_type = collection_u8_to_type(header & 0x0F)?; - - let possible_element_count = (header & 0xF0) >> 4; - let element_count = if possible_element_count != 15 { - // high bits set high if count and type encoded separately - possible_element_count as i32 - } else { - self.read_vlq()? as _ - }; - - Ok((element_type, element_count)) - } -} - -macro_rules! thrift_unimplemented { - () => { - Err(thrift::Error::Protocol(thrift::ProtocolError { - kind: thrift::ProtocolErrorKind::NotImplemented, - message: "not implemented".to_string(), - })) - }; -} - -impl TInputProtocol for TCompactSliceInputProtocol<'_> { - fn read_message_begin(&mut self) -> thrift::Result { - unimplemented!() - } - - fn read_message_end(&mut self) -> thrift::Result<()> { - thrift_unimplemented!() - } - - fn read_struct_begin(&mut self) -> thrift::Result> { - self.read_field_id_stack.push(self.last_read_field_id); - self.last_read_field_id = 0; - Ok(None) - } - - fn read_struct_end(&mut self) -> thrift::Result<()> { - self.last_read_field_id = self - .read_field_id_stack - .pop() - .expect("should have previous field ids"); - Ok(()) - } - - fn read_field_begin(&mut self) -> thrift::Result { - // we can read at least one byte, which is: - // - the type - // - the field delta and the type - let field_type = self.read_byte()?; - let field_delta = (field_type & 0xF0) >> 4; - let field_type = match field_type & 0x0F { - 0x01 => { - self.pending_read_bool_value = Some(true); - Ok(TType::Bool) - } - 0x02 => { - self.pending_read_bool_value = Some(false); - Ok(TType::Bool) - } - ttu8 => u8_to_type(ttu8), - }?; - - match field_type { - TType::Stop => Ok( - TFieldIdentifier::new::, String, Option>( - None, - TType::Stop, - None, - ), - ), - _ => { - if field_delta != 0 { - self.last_read_field_id = self - .last_read_field_id - .checked_add(field_delta as i16) - .map_or_else( - || { - Err(thrift::Error::Protocol(thrift::ProtocolError { - kind: thrift::ProtocolErrorKind::InvalidData, - message: format!( - "cannot add {} to {}", - field_delta, self.last_read_field_id - ), - })) - }, - Ok, - )?; - } else { - self.last_read_field_id = self.read_i16()?; - }; - - Ok(TFieldIdentifier { - name: None, - field_type, - id: Some(self.last_read_field_id), - }) - } - } - } - - fn read_field_end(&mut self) -> thrift::Result<()> { - Ok(()) - } - - fn read_bool(&mut self) -> thrift::Result { - match self.pending_read_bool_value.take() { - Some(b) => Ok(b), - None => { - let b = self.read_byte()?; - // Previous versions of the thrift specification said to use 0 and 1 inside collections, - // but that differed from existing implementations. 
- // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. - // At least the go implementation seems to have followed the previously documented values. - match b { - 0x01 => Ok(true), - 0x00 | 0x02 => Ok(false), - unkn => Err(thrift::Error::Protocol(thrift::ProtocolError { - kind: thrift::ProtocolErrorKind::InvalidData, - message: format!("cannot convert {unkn} into bool"), - })), - } - } - } - } - - fn read_bytes(&mut self) -> thrift::Result> { - let len = self.read_vlq()? as usize; - let ret = self.buf.get(..len).ok_or_else(eof_error)?.to_vec(); - self.buf = &self.buf[len..]; - Ok(ret) - } - - fn read_i8(&mut self) -> thrift::Result { - Ok(self.read_byte()? as _) - } - - fn read_i16(&mut self) -> thrift::Result { - Ok(self.read_zig_zag()? as _) - } - - fn read_i32(&mut self) -> thrift::Result { - Ok(self.read_zig_zag()? as _) - } - - fn read_i64(&mut self) -> thrift::Result { - self.read_zig_zag() - } - - fn read_double(&mut self) -> thrift::Result { - let slice = (self.buf[..8]).try_into().unwrap(); - self.buf = &self.buf[8..]; - Ok(f64::from_le_bytes(slice)) - } - - fn read_string(&mut self) -> thrift::Result { - let bytes = self.read_bytes()?; - String::from_utf8(bytes).map_err(From::from) - } - - fn read_list_begin(&mut self) -> thrift::Result { - let (element_type, element_count) = self.read_list_set_begin()?; - Ok(TListIdentifier::new(element_type, element_count)) - } - - fn read_list_end(&mut self) -> thrift::Result<()> { - Ok(()) - } - - fn read_set_begin(&mut self) -> thrift::Result { - thrift_unimplemented!() - } - - fn read_set_end(&mut self) -> thrift::Result<()> { - thrift_unimplemented!() - } - - fn read_map_begin(&mut self) -> thrift::Result { - thrift_unimplemented!() - } - - fn read_map_end(&mut self) -> thrift::Result<()> { - Ok(()) - } - - #[inline] - fn read_byte(&mut self) -> thrift::Result { - let ret = *self.buf.first().ok_or_else(eof_error)?; - self.buf = &self.buf[1..]; - Ok(ret) - } -} - -fn collection_u8_to_type(b: u8) -> thrift::Result { - match b { - // For historical and compatibility reasons, a reader should be capable to deal with both cases. - // The only valid value in the original spec was 2, but due to an widespread implementation bug - // the defacto standard across large parts of the library became 1 instead. - // As a result, both values are now allowed. 
- // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set - 0x01 | 0x02 => Ok(TType::Bool), - o => u8_to_type(o), - } -} - -fn u8_to_type(b: u8) -> thrift::Result { - match b { - 0x00 => Ok(TType::Stop), - 0x03 => Ok(TType::I08), // equivalent to TType::Byte - 0x04 => Ok(TType::I16), - 0x05 => Ok(TType::I32), - 0x06 => Ok(TType::I64), - 0x07 => Ok(TType::Double), - 0x08 => Ok(TType::String), - 0x09 => Ok(TType::List), - 0x0A => Ok(TType::Set), - 0x0B => Ok(TType::Map), - 0x0C => Ok(TType::Struct), - unkn => Err(thrift::Error::Protocol(thrift::ProtocolError { - kind: thrift::ProtocolErrorKind::InvalidData, - message: format!("cannot convert {unkn} into TType"), - })), - } -} - -fn eof_error() -> thrift::Error { - thrift::Error::Transport(thrift::TransportError { - kind: thrift::TransportErrorKind::EndOfFile, - message: "Unexpected EOF".to_string(), - }) -} - #[cfg(test)] mod tests { - use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; + use crate::{ + basic::Type, + file::page_index::{column_index::ColumnIndexMetaData, index_reader::decode_column_index}, + }; #[test] pub fn read_boolean_list_field_type() { // Boolean collection type encoded as 0x01, as used by this crate when writing. // Values encoded as 1 (true) or 2 (false) as in the current version of the thrift // documentation. - let bytes = vec![0x19, 0x21, 2, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0]; - - let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice()); - let index = crate::format::ColumnIndex::read_from_in_protocol(&mut protocol).unwrap(); - let expected = crate::format::ColumnIndex { - null_pages: vec![false, true], - min_values: vec![], - max_values: vec![], - boundary_order: crate::format::BoundaryOrder::UNORDERED, - null_counts: None, - repetition_level_histograms: None, - definition_level_histograms: None, + let bytes = vec![ + 0x19, 0x21, 2, 1, 0x19, 0x28, 1, 0, 0, 0x19, 0x28, 1, 1, 0, 0x15, 0, 0, + ]; + let index = decode_column_index(&bytes, Type::BOOLEAN).unwrap(); + + let index = match index { + ColumnIndexMetaData::BOOLEAN(index) => index, + _ => panic!("expected boolean column index"), }; - assert_eq!(&index, &expected); + // should be false, true + assert!(!index.is_null_page(0)); + assert!(index.is_null_page(1)); + assert!(!index.min_value(0).unwrap()); // min is false + assert!(index.max_value(0).unwrap()); // max is true + assert!(index.min_value(1).is_none()); + assert!(index.max_value(1).is_none()); } #[test] pub fn read_boolean_list_alternative_encoding() { // Boolean collection type encoded as 0x02, as allowed by the spec. // Values encoded as 1 (true) or 0 (false) as before the thrift documentation change on 2024-12-13. 
- let bytes = vec![0x19, 0x22, 0, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0]; - - let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice()); - let index = crate::format::ColumnIndex::read_from_in_protocol(&mut protocol).unwrap(); - let expected = crate::format::ColumnIndex { - null_pages: vec![false, true], - min_values: vec![], - max_values: vec![], - boundary_order: crate::format::BoundaryOrder::UNORDERED, - null_counts: None, - repetition_level_histograms: None, - definition_level_histograms: None, + let bytes = vec![ + 0x19, 0x22, 0, 1, 0x19, 0x28, 1, 0, 0, 0x19, 0x28, 1, 1, 0, 0x15, 0, 0, + ]; + let index = decode_column_index(&bytes, Type::BOOLEAN).unwrap(); + + let index = match index { + ColumnIndexMetaData::BOOLEAN(index) => index, + _ => panic!("expected boolean column index"), }; - assert_eq!(&index, &expected); + // should be false, true + assert!(!index.is_null_page(0)); + assert!(index.is_null_page(1)); + assert!(!index.min_value(0).unwrap()); // min is false + assert!(index.max_value(0).unwrap()); // max is true + assert!(index.min_value(1).is_none()); + assert!(index.max_value(1).is_none()); } } From 683d4e4c1853b84e389f66887345b31ffa641fe8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 8 Sep 2025 15:49:46 -0700 Subject: [PATCH 076/126] remove format::CompressionCodec --- parquet/src/basic.rs | 105 ------------------------------------------- 1 file changed, 105 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 5318016c3589..8e02bbdc2b60 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -1245,47 +1245,6 @@ impl From> for ConvertedType { } } -// ---------------------------------------------------------------------- -// crate::format::CompressionCodec <=> Compression conversion - -impl TryFrom for Compression { - type Error = ParquetError; - - fn try_from(value: crate::format::CompressionCodec) -> Result { - Ok(match value { - crate::format::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, - crate::format::CompressionCodec::SNAPPY => Compression::SNAPPY, - crate::format::CompressionCodec::GZIP => Compression::GZIP(Default::default()), - crate::format::CompressionCodec::LZO => Compression::LZO, - crate::format::CompressionCodec::BROTLI => Compression::BROTLI(Default::default()), - crate::format::CompressionCodec::LZ4 => Compression::LZ4, - crate::format::CompressionCodec::ZSTD => Compression::ZSTD(Default::default()), - crate::format::CompressionCodec::LZ4_RAW => Compression::LZ4_RAW, - _ => { - return Err(general_err!( - "unexpected parquet compression codec: {}", - value.0 - )) - } - }) - } -} - -impl From for crate::format::CompressionCodec { - fn from(value: Compression) -> Self { - match value { - Compression::UNCOMPRESSED => crate::format::CompressionCodec::UNCOMPRESSED, - Compression::SNAPPY => crate::format::CompressionCodec::SNAPPY, - Compression::GZIP(_) => crate::format::CompressionCodec::GZIP, - Compression::LZO => crate::format::CompressionCodec::LZO, - Compression::BROTLI(_) => crate::format::CompressionCodec::BROTLI, - Compression::LZ4 => crate::format::CompressionCodec::LZ4, - Compression::ZSTD(_) => crate::format::CompressionCodec::ZSTD, - Compression::LZ4_RAW => crate::format::CompressionCodec::LZ4_RAW, - } - } -} - // ---------------------------------------------------------------------- // String conversions for schema parsing. 
@@ -2000,70 +1959,6 @@ mod tests { ); } - #[test] - fn test_from_compression() { - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::UNCOMPRESSED).unwrap(), - Compression::UNCOMPRESSED - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::SNAPPY).unwrap(), - Compression::SNAPPY - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::GZIP).unwrap(), - Compression::GZIP(Default::default()) - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::LZO).unwrap(), - Compression::LZO - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::BROTLI).unwrap(), - Compression::BROTLI(Default::default()) - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::LZ4).unwrap(), - Compression::LZ4 - ); - assert_eq!( - Compression::try_from(crate::format::CompressionCodec::ZSTD).unwrap(), - Compression::ZSTD(Default::default()) - ); - } - - #[test] - fn test_into_compression() { - assert_eq!( - crate::format::CompressionCodec::UNCOMPRESSED, - Compression::UNCOMPRESSED.into() - ); - assert_eq!( - crate::format::CompressionCodec::SNAPPY, - Compression::SNAPPY.into() - ); - assert_eq!( - crate::format::CompressionCodec::GZIP, - Compression::GZIP(Default::default()).into() - ); - assert_eq!( - crate::format::CompressionCodec::LZO, - Compression::LZO.into() - ); - assert_eq!( - crate::format::CompressionCodec::BROTLI, - Compression::BROTLI(Default::default()).into() - ); - assert_eq!( - crate::format::CompressionCodec::LZ4, - Compression::LZ4.into() - ); - assert_eq!( - crate::format::CompressionCodec::ZSTD, - Compression::ZSTD(Default::default()).into() - ); - } - #[test] fn test_display_page_type() { assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE"); From e73a9229ad3fb25448c611091f34148b127f956a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 9 Sep 2025 07:25:03 -0700 Subject: [PATCH 077/126] remove format from statistics --- parquet/src/file/statistics.rs | 247 ++++----------------------------- parquet/src/file/writer.rs | 23 ++- 2 files changed, 45 insertions(+), 225 deletions(-) diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index e51f445b7e7e..1cd690f9353a 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -118,156 +118,6 @@ macro_rules! statistics_enum_func { }}; } -// FIXME(ets): remove this when done with format changes -/// Converts Thrift definition into `Statistics`. -pub fn from_thrift( - physical_type: Type, - thrift_stats: Option, -) -> Result> { - Ok(match thrift_stats { - Some(stats) => { - // Number of nulls recorded, when it is not available, we just mark it as 0. - // TODO this should be `None` if there is no information about NULLS. - // see https://github.com/apache/arrow-rs/pull/6216/files - let null_count = stats.null_count.unwrap_or(0); - - if null_count < 0 { - return Err(ParquetError::General(format!( - "Statistics null count is negative {null_count}", - ))); - } - - // Generic null count. - let null_count = Some(null_count as u64); - // Generic distinct count (count of distinct values occurring) - let distinct_count = stats.distinct_count.map(|value| value as u64); - // Whether or not statistics use deprecated min/max fields. - let old_format = stats.min_value.is_none() && stats.max_value.is_none(); - // Generic min value as bytes. - let min = if old_format { - stats.min - } else { - stats.min_value - }; - // Generic max value as bytes. 
- let max = if old_format { - stats.max - } else { - stats.max_value - }; - - fn check_len(min: &Option>, max: &Option>, len: usize) -> Result<()> { - if let Some(min) = min { - if min.len() < len { - return Err(ParquetError::General( - "Insufficient bytes to parse min statistic".to_string(), - )); - } - } - if let Some(max) = max { - if max.len() < len { - return Err(ParquetError::General( - "Insufficient bytes to parse max statistic".to_string(), - )); - } - } - Ok(()) - } - - match physical_type { - Type::BOOLEAN => check_len(&min, &max, 1), - Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), - Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), - Type::INT96 => check_len(&min, &max, 12), - _ => Ok(()), - }?; - - // Values are encoded using PLAIN encoding definition, except that - // variable-length byte arrays do not include a length prefix. - // - // Instead of using actual decoder, we manually convert values. - let res = match physical_type { - Type::BOOLEAN => Statistics::boolean( - min.map(|data| data[0] != 0), - max.map(|data| data[0] != 0), - distinct_count, - null_count, - old_format, - ), - Type::INT32 => Statistics::int32( - min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), - max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())), - distinct_count, - null_count, - old_format, - ), - Type::INT64 => Statistics::int64( - min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), - max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())), - distinct_count, - null_count, - old_format, - ), - Type::INT96 => { - // INT96 statistics may not be correct, because comparison is signed - let min = if let Some(data) = min { - assert_eq!(data.len(), 12); - Some(Int96::try_from_le_slice(&data)?) - } else { - None - }; - let max = if let Some(data) = max { - assert_eq!(data.len(), 12); - Some(Int96::try_from_le_slice(&data)?) - } else { - None - }; - Statistics::int96(min, max, distinct_count, null_count, old_format) - } - Type::FLOAT => Statistics::float( - min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), - max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), - distinct_count, - null_count, - old_format, - ), - Type::DOUBLE => Statistics::double( - min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), - max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), - distinct_count, - null_count, - old_format, - ), - Type::BYTE_ARRAY => Statistics::ByteArray( - ValueStatistics::new( - min.map(ByteArray::from), - max.map(ByteArray::from), - distinct_count, - null_count, - old_format, - ) - .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) - .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), - ), - Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray( - ValueStatistics::new( - min.map(ByteArray::from).map(FixedLenByteArray::from), - max.map(ByteArray::from).map(FixedLenByteArray::from), - distinct_count, - null_count, - old_format, - ) - .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) - .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), - ), - }; - - Some(res) - } - None => None, - }) -} - /// Converts Thrift definition into `Statistics`. pub(crate) fn from_thrift_page_stats( physical_type: Type, @@ -417,56 +267,6 @@ pub(crate) fn from_thrift_page_stats( }) } -// FIXME(ets): remove when done with format changes -/// Convert Statistics into Thrift definition. 
-pub fn to_thrift(stats: Option<&Statistics>) -> Option {
-    let stats = stats?;
-
-    // record null count if it can fit in i64
-    let null_count = stats
-        .null_count_opt()
-        .and_then(|value| i64::try_from(value).ok());
-
-    // record distinct count if it can fit in i64
-    let distinct_count = stats
-        .distinct_count_opt()
-        .and_then(|value| i64::try_from(value).ok());
-
-    let mut thrift_stats = crate::format::Statistics {
-        max: None,
-        min: None,
-        null_count,
-        distinct_count,
-        max_value: None,
-        min_value: None,
-        is_max_value_exact: None,
-        is_min_value_exact: None,
-    };
-
-    // Get min/max if set.
-    let (min, max, min_exact, max_exact) = (
-        stats.min_bytes_opt().map(|x| x.to_vec()),
-        stats.max_bytes_opt().map(|x| x.to_vec()),
-        Some(stats.min_is_exact()),
-        Some(stats.max_is_exact()),
-    );
-    if stats.is_min_max_backwards_compatible() {
-        // Copy to deprecated min, max values for compatibility with older readers
-        thrift_stats.min.clone_from(&min);
-        thrift_stats.max.clone_from(&max);
-    }
-
-    if !stats.is_min_max_deprecated() {
-        thrift_stats.min_value = min;
-        thrift_stats.max_value = max;
-    }
-
-    thrift_stats.is_min_value_exact = min_exact;
-    thrift_stats.is_max_value_exact = max_exact;
-
-    Some(thrift_stats)
-}
-
 /// Convert Statistics into Thrift definition.
 pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option {
     let stats = stats?;
@@ -518,15 +318,14 @@ pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option null
         assert_eq!(thrift_stats.null_count, Some(100));
     }

@@ -1209,18 +1014,24 @@
     fn test_count_encoding_null_too_large() {
         // statistics are stored using i64, so test trying to store larger values
         let statistics = make_bool_stats(Some(100), Some(u64::MAX));
-        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
+        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
         assert_eq!(thrift_stats.distinct_count, Some(100));
         assert_eq!(thrift_stats.null_count, None); // can't store u64 max --> null
     }

     #[test]
     fn test_count_decoding_null_invalid() {
-        let tstatistics = crate::format::Statistics {
+        let tstatistics = PageStatistics {
             null_count: Some(-42),
-            ..Default::default()
+            max: None,
+            min: None,
+            distinct_count: None,
+            max_value: None,
+            min_value: None,
+            is_max_value_exact: None,
+            is_min_value_exact: None,
         };
-        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
+        let err = from_thrift_page_stats(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
         assert_eq!(
             err.to_string(),
             "Parquet error: Statistics null count is negative -42"
@@ -1233,14 +1044,14 @@
     fn statistics_count_test(distinct_count: Option, null_count: Option) {
         let statistics = make_bool_stats(distinct_count, null_count);

-        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
+        let thrift_stats = page_stats_to_thrift(Some(&statistics)).unwrap();
         assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
         assert_eq!(
             thrift_stats.distinct_count.map(|c| c as u64),
             distinct_count
         );

-        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
+        let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
             .unwrap()
             .unwrap();
         // TODO: remove branch when we no longer support assuming null_count==None in the thrift
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 99e0e7e7fb80..b0faf66f5036 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1027,11 +1027,11 @@ mod tests {
     use crate::file::page_index::column_index::ColumnIndexMetaData;
     use
crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; - use crate::file::statistics::page_stats_to_thrift; + use crate::file::statistics::{from_thrift_page_stats, page_stats_to_thrift}; use crate::file::{ properties::{ReaderProperties, WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, - statistics::{from_thrift, to_thrift, Statistics}, + statistics::Statistics, }; use crate::record::{Row, RowAccessor}; use crate::schema::parser::parse_message_type; @@ -1482,8 +1482,11 @@ mod tests { encoding, def_level_encoding, rep_level_encoding, - statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())) - .unwrap(), + statistics: from_thrift_page_stats( + physical_type, + page_stats_to_thrift(statistics.as_ref()), + ) + .unwrap(), } } Page::DataPageV2 { @@ -1512,8 +1515,11 @@ mod tests { def_levels_byte_len, rep_levels_byte_len, is_compressed: compressor.is_some(), - statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())) - .unwrap(), + statistics: from_thrift_page_stats( + physical_type, + page_stats_to_thrift(statistics.as_ref()), + ) + .unwrap(), } } Page::DictionaryPage { @@ -1604,7 +1610,10 @@ mod tests { assert_eq!(&left.buffer(), &right.buffer()); assert_eq!(left.num_values(), right.num_values()); assert_eq!(left.encoding(), right.encoding()); - assert_eq!(to_thrift(left.statistics()), to_thrift(right.statistics())); + assert_eq!( + page_stats_to_thrift(left.statistics()), + page_stats_to_thrift(right.statistics()) + ); } /// Tests roundtrip of i32 data written using `W` and read using `R` From f81a732402bb6c8896ec204f15bf23a4943dfcf1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 14:05:41 -0700 Subject: [PATCH 078/126] get a start on some documentation and add some TODOs --- parquet/src/basic.rs | 5 +- parquet/src/parquet_macros.rs | 4 ++ parquet/src/parquet_thrift.rs | 89 +++++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 5fd49043731e..5fffb56cdf74 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -774,8 +774,9 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { } } -// FIXME -// ugh...why did we add compression level to some variants if we don't use them???? +// TODO(ets): explore replacing this with a thrift_enum!(ThriftCompression) for the serialization +// and then provide `From` impls to convert back and forth. This is necessary due to the addition +// of compression level to some variants. impl WriteThrift for Compression { const ELEMENT_TYPE: ElementType = ElementType::I32; diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index ae1d772a07cb..eb523a6982a0 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -20,6 +20,10 @@ // They allow for pasting sections of the Parquet thrift IDL file // into a macro to generate rust structures and implementations. +// TODO(ets): These macros need a good bit of documentation so other developers will be able +// to use them correctly. Also need to write a .md file with complete examples of both how +// to use the macros, and how to implement custom readers and writers when necessary. 
+
 #[macro_export]
 #[allow(clippy::crate_in_macro_def)]
 /// macro to generate rust enums from a thrift enum definition
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index 593aec4e0f2b..590e5d9e1eb7 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -549,24 +549,36 @@ where
 /////////////////////////
 // thrift compact output

+/// Low-level object used to serialize structs to the Thrift [compact output] protocol.
+///
+/// This struct serves as a wrapper around a [`Write`] object, to which thrift encoded data
+/// will be written. The implementation provides functions to write Thrift primitive types, as well
+/// as functions used in the encoding of lists and structs. This should rarely be used directly,
+/// but is instead intended for use by implementers of [`WriteThrift`] and [`WriteThriftField`].
+///
+/// [compact output]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 pub(crate) struct ThriftCompactOutputProtocol {
     writer: W,
 }

 impl ThriftCompactOutputProtocol {
+    /// Create a new `ThriftCompactOutputProtocol` wrapping the byte sink `writer`.
     pub(crate) fn new(writer: W) -> Self {
         Self { writer }
     }

+    /// Return a reference to the underlying `Write`.
     pub(crate) fn inner(&self) -> &W {
         &self.writer
     }

+    /// Write a single byte to the output stream.
     fn write_byte(&mut self, b: u8) -> Result<()> {
         self.writer.write_all(&[b])?;
         Ok(())
     }

+    /// Write the given `u64` as a ULEB128 encoded varint.
     fn write_vlq(&mut self, val: u64) -> Result<()> {
         let mut v = val;
         while v > 0x7f {
@@ -576,11 +588,16 @@ impl ThriftCompactOutputProtocol {
         self.write_byte(v as u8)
     }

+    /// Write the given `i64` as a zig-zag encoded varint (e.g. 0 encodes to 0, -1 to 1,
+    /// 1 to 2, and -2 to 3).
     fn write_zig_zag(&mut self, val: i64) -> Result<()> {
         let s = (val < 0) as i64;
         self.write_vlq((((val ^ -s) << 1) + s) as u64)
     }

+    /// Used to mark the start of a Thrift struct field of type `field_type`. `last_field_id`
+    /// is used to compute a delta to the given `field_id` per the compact protocol [spec].
+    /// For example, a field delta of 1 for an i32 field (type 5) encodes as the single byte `0x15`.
+    ///
+    /// [spec]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding
     pub(crate) fn write_field_begin(
         &mut self,
         field_type: FieldType,
@@ -596,6 +613,7 @@ impl ThriftCompactOutputProtocol {
         }
     }

+    /// Used to indicate the start of a list of `element_type` elements.
     pub(crate) fn write_list_begin(&mut self, element_type: ElementType, len: usize) -> Result<()> {
         if len < 15 {
             self.write_byte((len as u8) << 4 | element_type as u8)
@@ -605,22 +623,29 @@ impl ThriftCompactOutputProtocol {
         }
     }

+    /// Used to mark the end of a struct. This must be called after all fields of the struct have
+    /// been written.
     pub(crate) fn write_struct_end(&mut self) -> Result<()> {
         self.write_byte(0)
     }

+    /// Serialize a slice of `u8`s. This will encode a length, and then write the bytes without
+    /// further encoding.
     pub(crate) fn write_bytes(&mut self, val: &[u8]) -> Result<()> {
         self.write_vlq(val.len() as u64)?;
         self.writer.write_all(val)?;
         Ok(())
     }

+    /// Short-cut method used to encode structs that have no fields (often used in Thrift unions).
+    /// This simply encodes the field id and then immediately writes the end-of-struct marker.
     pub(crate) fn write_empty_struct(&mut self, field_id: i16, last_field_id: i16) -> Result {
         self.write_field_begin(FieldType::Struct, field_id, last_field_id)?;
         self.write_struct_end()?;
         Ok(last_field_id)
     }

+    /// Write a boolean value.
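+    ///
+    /// Note that this full-byte encoding (1 = true, 0 = false) is only used for booleans
+    /// that appear as list elements. A boolean struct field carries no payload; its value
+    /// is instead folded into the field header written by `write_field_begin`.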
 pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> {
         match val {
             true => self.write_byte(1),
@@ -628,35 +653,47 @@ impl ThriftCompactOutputProtocol {
         }
     }

+    /// Write an `i8` value as a single byte (no zig-zag encoding is applied).
     pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> {
         self.write_byte(val as u8)
     }

+    /// Write a zig-zag encoded `i16` value.
     pub(crate) fn write_i16(&mut self, val: i16) -> Result<()> {
         self.write_zig_zag(val as _)
     }

+    /// Write a zig-zag encoded `i32` value.
     pub(crate) fn write_i32(&mut self, val: i32) -> Result<()> {
         self.write_zig_zag(val as _)
     }

+    /// Write a zig-zag encoded `i64` value.
     pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> {
         self.write_zig_zag(val as _)
     }

+    /// Write a double value.
     pub(crate) fn write_double(&mut self, val: f64) -> Result<()> {
         self.writer.write_all(&val.to_le_bytes())?;
         Ok(())
     }
 }

+/// Trait implemented by objects that are to be serialized to a Thrift [compact output] protocol
+/// stream. Implementations are also provided for primitive Thrift types.
+///
+/// [compact output]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 pub(crate) trait WriteThrift {
+    /// The [`ElementType`] to use when a list of this object is written.
     const ELEMENT_TYPE: ElementType;

-    // used to write generated enums and structs
+    /// Serialize this object to the given `writer`.
     fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>;
 }

+/// Implementation for a vector of thrift serializable objects that implement [`WriteThrift`].
+/// This will write the necessary list header and then serialize the elements one at a time.
 impl WriteThrift for Vec
 where
     T: WriteThrift,
@@ -744,9 +781,55 @@ impl WriteThrift for String {
     }
 }

+/// Trait implemented by objects that are fields of Thrift structs.
+///
+/// For example, given the Thrift struct definition
+/// ```
+/// struct MyStruct {
+///     1: required i32 field1
+///     2: optional bool field2
+///     3: optional OtherStruct field3
+/// }
+/// ```
+///
+/// which becomes in Rust
+/// ```rust
+/// struct MyStruct {
+///     field1: i32,
+///     field2: Option,
+///     field3: Option,
+/// }
+/// ```
+/// the impl of `WriteThrift` for `MyStruct` will use the `WriteThriftField` impls for `i32`,
+/// `bool`, and `OtherStruct`.
+///
+/// ```
+/// impl WriteThrift for MyStruct {
+///     const ELEMENT_TYPE: ElementType = ElementType::Double;
+///
+///     fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+///         let mut last_field_id = 0i16;
+///         last_field_id = self.field1.write_thrift_field(writer, 1, last_field_id)?;
+///         if self.field2.is_some() {
+///             // if field2 is `None` then this assignment won't happen and last_field_id will remain
+///             // `1` when writing `field3`
+///             last_field_id = self.field2.write_thrift_field(writer, 2, last_field_id)?;
+///         }
+///         if self.field3.is_some() {
+///             // no need to assign last_field_id since this is the final field.
+///             self.field3.write_thrift_field(writer, 3, last_field_id)?;
+///         }
+///     }
+/// }
+/// ```
+///
 pub(crate) trait WriteThriftField {
-    // used to write struct fields (which may be basic types or generated types).
-    // write the field header and field value. returns `field_id`.
+    /// Used to write struct fields (which may be primitive or IDL defined types). This will
+    /// write the field marker for the given `field_id`, using `last_field_id` to compute the
+    /// field delta used by the Thrift [compact protocol]. On success this will return `field_id`
+ /// + /// [compact protocol]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, From 7268dd3343be9ab1070614aaa5d599e3dd3110d6 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 14:39:09 -0700 Subject: [PATCH 079/126] fix docs --- parquet/src/parquet_thrift.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 590e5d9e1eb7..9b83c0a01b8d 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -784,7 +784,7 @@ impl WriteThrift for String { /// Trait implemented by objects that are fields of Thrift structs. /// /// For example, given the Thrift struct definition -/// ``` +/// ```ignore /// struct MyStruct { /// 1: required i32 field1 /// 2: optional bool field2 @@ -793,7 +793,8 @@ impl WriteThrift for String { /// ``` /// /// which becomes in Rust -/// ```rust +/// ```no_run +/// # struct OtherStruct {} /// struct MyStruct { /// field1: i32, /// field2: Option, @@ -803,10 +804,8 @@ impl WriteThrift for String { /// the impl of `WriteThrift` for `MyStruct` will use the `WriteThriftField` impls for `i32`, /// `bool`, and `OtherStruct`. /// -/// ``` +/// ```ignore /// impl WriteThrift for MyStruct { -/// const ELEMENT_TYPE: ElementType = ElementType::Double; -/// /// fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { /// let mut last_field_id = 0i16; /// last_field_id = self.field1.write_thrift_field(writer, 1, last_field_id)?; @@ -819,6 +818,7 @@ impl WriteThrift for String { /// // no need to assign last_field_id since this is the final field. /// self.field3.write_thrift_field(writer, 3, last_field_id)?; /// } +/// writer.write_struct_end() /// } /// } /// ``` From cfa674012cefad01070b7c9c03b4e4059496105b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 15:38:11 -0700 Subject: [PATCH 080/126] backport fix for tests without encryption --- parquet/src/file/serialized_reader.rs | 6 ++++++ parquet/tests/arrow_reader/bad_data.rs | 3 +++ 2 files changed, 9 insertions(+) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 335f0bc3601b..728598045315 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1875,10 +1875,16 @@ mod tests { 80, 65, 82, 49, ]; let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); + #[cfg(feature = "encryption")] assert_eq!( ret.err().unwrap().to_string(), "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" ); + #[cfg(not(feature = "encryption"))] + assert_eq!( + ret.err().unwrap().to_string(), + "Parquet error: Received empty union from remote ColumnOrder" + ); } #[test] diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 58e342ab39d1..ecf449a7ce61 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -80,10 +80,13 @@ fn test_invalid_files() { #[test] fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); + #[cfg(feature = "encryption")] assert_eq!( err.to_string(), "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); + #[cfg(not(feature = "encryption"))] + assert_eq!(err.to_string(), "Parquet error: Unexpected Type -7"); } #[test] From 82f31a41934d606015b31726a7b09bb61669c190 Mon Sep 
17 00:00:00 2001
From: Ed Seidl 
Date: Thu, 11 Sep 2025 07:47:44 -0700
Subject: [PATCH 081/126] add documentation

---
 parquet/src/parquet_thrift.rs | 89 +++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 20 deletions(-)

diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index e37942f96207..17847d0b71e5 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -24,10 +24,9 @@
 use std::{cmp::Ordering, io::Write};

 use crate::errors::{ParquetError, Result};

-// Couldn't implement thrift structs with f64 do to lack of Eq
-// for f64. This is a hacky workaround for now...there are other
-// wrappers out there that should probably be used instead.
-// thrift seems to re-export an impl from ordered-float
+/// Wrapper for thrift `double` fields. This is used to provide
+/// an implementation of `Eq` for floats. This implementation
+/// uses IEEE 754 total order.
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub struct OrderedF64(f64);
@@ -156,25 +155,52 @@ impl TryFrom for ElementType {
     }
 }

+/// Struct used to describe a [thrift struct] field during decoding.
+///
+/// [thrift struct]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding
 pub(crate) struct FieldIdentifier {
+    /// The type for the field.
     pub(crate) field_type: FieldType,
+    /// The field's `id`. May be computed from delta or directly decoded.
     pub(crate) id: i16,
+    /// Stores the value for booleans.
+    ///
+    /// Boolean fields store no data; instead, the field type is either boolean true or
+    /// boolean false.
     pub(crate) bool_val: Option,
 }

+/// Struct used to describe a [thrift list].
+///
+/// [thrift list]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub(crate) struct ListIdentifier {
+    /// The type for each element in the list.
     pub(crate) element_type: ElementType,
+    /// Number of elements contained in the list.
     pub(crate) size: i32,
 }

+/// Low-level trait used to deserialize structs encoded with the Thrift [compact] protocol.
+///
+/// Implementations of this trait must provide the low-level functions `read_byte`, `read_bytes`,
+/// `skip_bytes`, and `read_double`. These primitives are used by the default functions provided
+/// here to perform deserialization.
+///
+/// [compact]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 pub(crate) trait ThriftCompactInputProtocol<'a> {
+    /// Read a single byte from the input.
     fn read_byte(&mut self) -> Result;

+    /// Read a Thrift encoded [binary] from the input.
+    ///
+    /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding
     fn read_bytes(&mut self) -> Result<&'a [u8]>;

+    /// Skip the next `n` bytes of input.
     fn skip_bytes(&mut self, n: usize) -> Result<()>;

+    /// Read a ULEB128 encoded unsigned varint from the input.
     fn read_vlq(&mut self) -> Result {
         let mut in_progress = 0;
         let mut shift = 0;
@@ -188,11 +214,13 @@
     }

+    /// Read a zig-zag encoded signed varint from the input.
     fn read_zig_zag(&mut self) -> Result {
         let val = self.read_vlq()?;
         Ok((val >> 1) as i64 ^ -((val & 1) as i64))
     }

+    /// Read the [`ListIdentifier`] for a Thrift encoded list.
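+    ///
+    /// When the list holds fewer than 15 elements, the size and element type are packed
+    /// into a single header byte; otherwise the size nibble is set to 0xf and a varint
+    /// size follows. As an illustrative sketch, a list of three i32 values would begin
+    /// with the header byte `0x35` (size 3 in the high nibble, element type 5 in the low nibble).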
fn read_list_begin(&mut self) -> Result { let header = self.read_byte()?; let element_type = ElementType::try_from(header & 0x0f)?; @@ -211,6 +239,7 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { }) } + /// Read the [`FieldIdentifier`] for a field in a Thrift encoded struct. fn read_field_begin(&mut self, last_field_id: i16) -> Result { // we can read at least one byte, which is: // - the type @@ -256,12 +285,12 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } } - // This is a specialized version of read_field_begin, solely for use in parsing - // PageLocation structs in the offset index. This function assumes that the delta - // field will always be less than 0xf, fields will be in order, and no boolean fields - // will be read. This also skips validation of the field type. - // - // Returns a tuple of (field_type, field_delta) + /// This is a specialized version of [`Self::read_field_begin`], solely for use in parsing + /// simple structs. This function assumes that the delta field will always be less than 0xf, + /// fields will be in order, and no boolean fields will be read. + /// This also skips validation of the field type. + /// + /// Returns a tuple of `(field_type, field_delta)`. fn read_field_header(&mut self) -> Result<(u8, u8)> { let field_type = self.read_byte()?; let field_delta = (field_type & 0xf0) >> 4; @@ -269,7 +298,8 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { Ok((field_type, field_delta)) } - // not to be used for bool struct fields, just for bool arrays + /// Read a boolean list element. This should not be used for struct fields. For the latter, + /// use the [`FieldIdentifier::bool_val`] field. fn read_bool(&mut self) -> Result { let b = self.read_byte()?; // Previous versions of the thrift specification said to use 0 and 1 inside collections, @@ -283,29 +313,38 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } } + /// Read a Thrift [binary] as a UTF-8 encoded string. + /// + /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding fn read_string(&mut self) -> Result<&'a str> { let slice = self.read_bytes()?; Ok(std::str::from_utf8(slice)?) } + /// Read an `i8`. fn read_i8(&mut self) -> Result { Ok(self.read_byte()? as _) } + /// Read an `i16`. fn read_i16(&mut self) -> Result { Ok(self.read_zig_zag()? as _) } + /// Read an `i32`. fn read_i32(&mut self) -> Result { Ok(self.read_zig_zag()? as _) } + /// Read an `i64`. fn read_i64(&mut self) -> Result { self.read_zig_zag() } + /// Read a Thrift `double` as `f64`. fn read_double(&mut self) -> Result; + /// Skip a ULEB128 encoded varint. fn skip_vlq(&mut self) -> Result<()> { loop { let byte = self.read_byte()?; @@ -315,20 +354,24 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } } + /// Skip a thrift [binary]. + /// + /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding fn skip_binary(&mut self) -> Result<()> { let len = self.read_vlq()? as usize; self.skip_bytes(len) } /// Skip a field with type `field_type` recursively until the default - /// maximum skip depth is reached. + /// maximum skip depth (currently 64) is reached. fn skip(&mut self, field_type: FieldType) -> Result<()> { - // TODO: magic number - self.skip_till_depth(field_type, 64) + const DEFAULT_SKIP_DEPTH: i8 = 64; + self.skip_till_depth(field_type, DEFAULT_SKIP_DEPTH) } /// Empty structs in unions consist of a single byte of 0 for the field stop record. 
-    /// This skips that byte without pushing to the field id stack.
+    /// This skips that byte without incurring the cost of processing the [`FieldIdentifier`].
+    /// Will return an error if the struct is not actually empty.
     fn skip_empty_struct(&mut self) -> Result<()> {
         let b = self.read_byte()?;
         if b != 0 {
@@ -379,19 +422,23 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
     }
 }

+/// A high-performance Thrift reader that reads from a slice of bytes.
 pub(crate) struct ThriftSliceInputProtocol<'a> {
     buf: &'a [u8],
 }

 impl<'a> ThriftSliceInputProtocol<'a> {
+    /// Create a new `ThriftSliceInputProtocol` using the bytes in `buf`.
     pub fn new(buf: &'a [u8]) -> Self {
         Self { buf }
     }

+    /// Re-initialize this reader with a new slice.
     pub fn reset_buffer(&mut self, buf: &'a [u8]) {
         self.buf = buf;
     }

+    /// Return the current buffer as a slice.
     pub fn as_slice(&self) -> &'a [u8] {
         self.buf
     }
@@ -433,8 +480,10 @@ fn eof_error() -> ParquetError {
     eof_err!("Unexpected EOF")
 }

+/// Trait implemented for objects that can be deserialized from a Thrift input stream.
+/// Implementations are provided for Thrift primitive types.
 pub(crate) trait ReadThrift<'a, R: ThriftCompactInputProtocol<'a>> {
-    // used to read generated enums and structs
+    /// Read an object of type `Self` from the input protocol object.
     fn read_thrift(prot: &mut R) -> Result
     where
         Self: Sized;
@@ -494,6 +543,9 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a [u8] {
     }
 }

+/// Read a Thrift encoded [list] from the input protocol object.
+///
+/// [list]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
 pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result>
 where
     R: ThriftCompactInputProtocol<'a>,
@@ -945,10 +997,7 @@ pub(crate) mod tests {

     pub(crate) fn test_roundtrip(val: T)
     where
-        T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>>
-            + WriteThrift
-            + PartialEq
-            + Debug,
+        T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift + PartialEq + Debug,
     {
         let buf = Vec::::new();
         let mut writer = ThriftCompactOutputProtocol::new(buf);
From 237ca3d1e71534905d1a72201e9acd0663625f9d Mon Sep 17 00:00:00 2001
From: Ed Seidl 
Date: Thu, 11 Sep 2025 07:56:00 -0700
Subject: [PATCH 082/126] add docs for ThriftReadInputProtocol

---
 parquet/src/parquet_thrift.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index c17f387b0caf..5d549f012c86 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -489,8 +489,10 @@ fn eof_error() -> ParquetError {
     eof_err!("Unexpected EOF")
 }

-// input protocol that's only intended for use in reading page headers. not fully implemented
-// so this shouldn't be used generally.
+/// A Thrift input protocol that wraps a [`Read`] object.
+///
+/// Note that this is only intended for use in reading Parquet page headers. This will panic
+/// if Thrift `binary` data is encountered because a slice of that data cannot be returned.
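+///
+/// Contrast this with [`ThriftSliceInputProtocol`], which borrows from an in-memory buffer and
+/// can therefore serve `binary` data as zero-copy slices. A rough usage sketch (assuming a
+/// `new` constructor mirroring the slice reader's):
+///
+/// ```ignore
+/// let mut prot = ThriftReadInputProtocol::new(&mut reader);
+/// let page_header = PageHeader::read_thrift(&mut prot)?;
+/// ```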
pub(crate) struct ThriftReadInputProtocol { reader: R, } From 2091e49afafaa819ff99f7300a9be52b634035f4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 11 Sep 2025 08:12:55 -0700 Subject: [PATCH 083/126] move PageEncodingStats to thrift_gen --- parquet/src/column/writer/mod.rs | 2 +- parquet/src/file/metadata/memory.rs | 2 +- parquet/src/file/metadata/mod.rs | 6 ++-- parquet/src/file/metadata/thrift_gen.rs | 10 ++++++- parquet/src/file/mod.rs | 1 - parquet/src/file/page_encoding_stats.rs | 38 ------------------------- 6 files changed, 13 insertions(+), 46 deletions(-) delete mode 100644 parquet/src/file/page_encoding_stats.rs diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 984db62aa2b0..6321e0ff7b6d 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -21,6 +21,7 @@ use bytes::Bytes; use half::f16; use crate::bloom_filter::Sbbf; +use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; use std::collections::{BTreeSet, VecDeque}; @@ -42,7 +43,6 @@ use crate::file::metadata::{ ColumnChunkMetaData, ColumnChunkMetaDataBuilder, ColumnIndexBuilder, LevelHistogram, OffsetIndexBuilder, }; -use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::properties::{ EnabledStatistics, WriterProperties, WriterPropertiesPtr, WriterVersion, }; diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 19122a1b5522..be21c9b170a6 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -20,10 +20,10 @@ //! [`ParquetMetadata::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType}; use crate::data_type::private::ParquetValueType; +use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData, SortingColumn, }; -use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index cf9f908e0023..57426aba6d30 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -98,12 +98,10 @@ use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::ColumnCryptoMetaData; pub(crate) use crate::file::metadata::memory::HeapSize; +use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex}; +use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}; use crate::file::statistics::Statistics; -use crate::file::{ - page_encoding_stats::PageEncodingStats, - page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}, -}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 48aeb2df38dd..2466e267d3c3 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -31,7 +31,6 @@ use crate::{ ColumnChunkMetaData, KeyValue, LevelHistogram, 
ParquetMetaData, RowGroupMetaData, SortingColumn, }, - page_encoding_stats::PageEncodingStats, statistics::ValueStatistics, }, parquet_thrift::{ @@ -52,6 +51,15 @@ use crate::{ schema::types::SchemaDescPtr, }; +thrift_struct!( +/// PageEncodingStats for a column chunk and data page. +pub struct PageEncodingStats { + 1: required PageType page_type; + 2: required Encoding encoding; + 3: required i32 count; +} +); + // this needs to be visible to the schema conversion code thrift_struct!( pub(crate) struct SchemaElement<'a> { diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index 976b36dc2358..09036cd7d7b9 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -100,7 +100,6 @@ #[cfg(feature = "encryption")] pub mod column_crypto_metadata; pub mod metadata; -pub mod page_encoding_stats; pub mod page_index; pub mod properties; pub mod reader; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs deleted file mode 100644 index 9b414430f0eb..000000000000 --- a/parquet/src/file/page_encoding_stats.rs +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Per-page encoding information. - -use std::io::Write; - -use crate::basic::{Encoding, PageType}; -use crate::errors::{ParquetError, Result}; -use crate::parquet_thrift::{ - ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, -}; -use crate::thrift_struct; - -// TODO(ets): This should probably all be moved to thrift_gen -thrift_struct!( -/// PageEncodingStats for a column chunk and data page. 
-pub struct PageEncodingStats { - 1: required PageType page_type; - 2: required Encoding encoding; - 3: required i32 count; -} -); From e0deed9cf062097fdf86e4e1619d10805138cac1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 15 Sep 2025 16:42:53 -0700 Subject: [PATCH 084/126] move PageEncodingStats to mod.rs since it needs to be public --- parquet/src/column/writer/mod.rs | 3 +-- parquet/src/file/metadata/memory.rs | 3 +-- parquet/src/file/metadata/mod.rs | 13 +++++++++++-- parquet/src/file/metadata/thrift_gen.rs | 13 ++----------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 6321e0ff7b6d..93adac22277b 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -21,7 +21,6 @@ use bytes::Bytes; use half::f16; use crate::bloom_filter::Sbbf; -use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; use std::collections::{BTreeSet, VecDeque}; @@ -41,7 +40,7 @@ use crate::encryption::encrypt::get_column_crypto_metadata; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ ColumnChunkMetaData, ColumnChunkMetaDataBuilder, ColumnIndexBuilder, LevelHistogram, - OffsetIndexBuilder, + OffsetIndexBuilder, PageEncodingStats, }; use crate::file::properties::{ EnabledStatistics, WriterProperties, WriterPropertiesPtr, WriterVersion, diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index be21c9b170a6..611486824998 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -20,9 +20,8 @@ //! [`ParquetMetadata::memory_size`]: crate::file::metadata::ParquetMetaData::memory_size use crate::basic::{BoundaryOrder, ColumnOrder, Compression, Encoding, PageType}; use crate::data_type::private::ParquetValueType; -use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::metadata::{ - ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData, SortingColumn, + ColumnChunkMetaData, FileMetaData, KeyValue, PageEncodingStats, RowGroupMetaData, SortingColumn, }; use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index ce15deae5c18..a027f7a476c4 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -93,12 +93,12 @@ pub(crate) mod reader; pub(crate) mod thrift_gen; mod writer; +use crate::basic::PageType; #[cfg(feature = "encryption")] use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::ColumnCryptoMetaData; pub(crate) use crate::file::metadata::memory::HeapSize; -use crate::file::metadata::thrift_gen::PageEncodingStats; use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex}; use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}; use crate::file::statistics::Statistics; @@ -420,7 +420,7 @@ impl From for ParquetMetaDataBuilder { } } -// TODO: should this move to thrift_gen? +// TODO(ets): should this move to thrift_gen? thrift_struct!( /// A key-value pair for [`FileMetaData`]. pub struct KeyValue { @@ -442,6 +442,15 @@ impl KeyValue { } } +thrift_struct!( +/// PageEncodingStats for a column chunk and data page. 
+pub struct PageEncodingStats { + 1: required PageType page_type; + 2: required Encoding encoding; + 3: required i32 count; +} +); + /// Reference counted pointer for [`FileMetaData`]. pub type FileMetaDataPtr = Arc; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 2466e267d3c3..195616836393 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -28,8 +28,8 @@ use crate::{ errors::{ParquetError, Result}, file::{ metadata::{ - ColumnChunkMetaData, KeyValue, LevelHistogram, ParquetMetaData, RowGroupMetaData, - SortingColumn, + ColumnChunkMetaData, KeyValue, LevelHistogram, PageEncodingStats, ParquetMetaData, + RowGroupMetaData, SortingColumn, }, statistics::ValueStatistics, }, @@ -51,15 +51,6 @@ use crate::{ schema::types::SchemaDescPtr, }; -thrift_struct!( -/// PageEncodingStats for a column chunk and data page. -pub struct PageEncodingStats { - 1: required PageType page_type; - 2: required Encoding encoding; - 3: required i32 count; -} -); - // this needs to be visible to the schema conversion code thrift_struct!( pub(crate) struct SchemaElement<'a> { From 218b42b539d40ea2f6d90527a79b76b91e1608de Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Sep 2025 10:16:22 -0700 Subject: [PATCH 085/126] cleanup and add some documentation --- parquet/src/parquet_macros.rs | 155 +++++++++++++--------------------- 1 file changed, 61 insertions(+), 94 deletions(-) diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 6f9fa12ffeff..2ea3df46e49c 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -20,13 +20,22 @@ // They allow for pasting sections of the Parquet thrift IDL file // into a macro to generate rust structures and implementations. -// TODO(ets): These macros need a good bit of documentation so other developers will be able -// to use them correctly. Also need to write a .md file with complete examples of both how -// to use the macros, and how to implement custom readers and writers when necessary. +//! This is a collection of macros used to parse Thrift IDL descriptions of structs, +//! unions, and enums into their corresponding Rust types. These macros will also +//! generate the code necessary to serialize and deserialize to/from the [Thrift compact] +//! protocol. +//! +//! Further details of how to use them (and other aspects of the Thrift serialization process) +//! can be found in [THRIFT.md]. +//! +//! [Thrift compact]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set +//! [THRIFT.md]: https://github.com/apache/arrow-rs/blob/main/parquet/THRIFT.md #[macro_export] #[allow(clippy::crate_in_macro_def)] -/// macro to generate rust enums from a thrift enum definition +/// Macro used to generate rust enums from a Thrift `enum` definition. +/// +/// When utilizing this macro the Thrift serialization traits and structs need to be in scope. macro_rules! thrift_enum { ($(#[$($def_attrs:tt)*])* enum $identifier:ident { $($(#[$($field_attrs:tt)*])* $field_name:ident = $field_value:literal;)* }) => { $(#[$($def_attrs)*])* @@ -72,7 +81,16 @@ macro_rules! thrift_enum { } } -/// macro to generate rust enums for thrift unions where all fields are typed with empty structs +/// Macro used to generate Rust enums for Thrift unions in which all variants are typed with empty +/// structs. 
+/// +/// Because the compact protocol does not write any struct type information, these empty structs +/// become a single `0` (end-of-fields marker) upon serialization. Rather than trying to deserialize +/// an empty struct, we can instead simply read the `0` and discard it. +/// +/// The resulting Rust enum will have all unit variants. +/// +/// When utilizing this macro the Thrift serialization traits and structs need to be in scope. #[macro_export] #[allow(clippy::crate_in_macro_def)] macro_rules! thrift_union_all_empty { @@ -134,10 +152,17 @@ macro_rules! thrift_union_all_empty { } } -/// macro to generate rust enums for thrift unions where all variants are a mix of unit and tuple types. -/// this requires modifying the thrift IDL. For variants with empty structs as their type, -/// delete the typename (i.e. "1: EmptyStruct Var1;" => "1: Var1"). For variants with a non-empty -/// type, put the typename in parens (e.g" "1: Type Var1;" => "1: (Type) Var1;"). +/// Macro used to generate Rust enums for Thrift unions where variants are a mix of unit and +/// tuple types. +/// +/// Use of this macro requires modifying the thrift IDL. For variants with empty structs as their +/// type, delete the typename (i.e. `1: EmptyStruct Var1;` becomes `1: Var1`). For variants with a +/// non-empty type, the typename must be contained within parens (e.g. `1: MyType Var1;` becomes +/// `1: (MyType) Var1;`). +/// +/// This macro allows for specifying lifetime annotations for the resulting `enum` and its fields. +/// +/// When utilizing this macro the Thrift serialization traits and structs need to be in scope. #[macro_export] #[allow(clippy::crate_in_macro_def)] macro_rules! thrift_union { @@ -198,31 +223,11 @@ macro_rules! thrift_union { } } -#[doc(hidden)] -#[macro_export] -macro_rules! __thrift_write_variant_lhs { - ($field_name:ident $field_type:ident, $val:tt) => { - Self::$field_name($val) - }; - ($field_name:ident, $val:tt) => { - Self::$field_name - }; -} - -#[doc(hidden)] -#[macro_export] -macro_rules! __thrift_write_variant_rhs { - ($field_id:literal $field_type:ident, $writer:tt, $val:ident) => { - $val.write_thrift_field($writer, $field_id, 0)? - }; - ($field_id:literal, $writer:tt, $val:tt) => { - $writer.write_empty_struct($field_id, 0)? - }; -} - -/// macro to generate rust structs from a thrift struct definition -/// unlike enum and union, this macro will allow for visibility specifier -/// can also take optional lifetime for struct and elements within it (need e.g.) +/// Macro used to generate Rust structs from a Thrift `struct` definition. +/// +/// This macro allows for specifying lifetime annotations for the resulting `struct` and its fields. +/// +/// When utilizing this macro the Thrift serialization traits and structs need to be in scope. #[macro_export] macro_rules! thrift_struct { ($(#[$($def_attrs:tt)*])* $vis:vis struct $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $required_or_optional:ident $field_type:ident $(< $field_lt:lifetime >)? $(< $element_type:ident >)? $field_name:ident $(= $default_value:literal)? $(;)?)* }) => { @@ -284,66 +289,6 @@ macro_rules! thrift_struct { } } -/// only implements ReadThrift for the give IDL struct definition -#[macro_export] -macro_rules! thrift_struct_read_impl { - ($(#[$($def_attrs:tt)*])* $vis:vis struct $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $required_or_optional:ident $field_type:ident $(< $field_lt:lifetime >)? $(< $element_type:ident >)? 
$field_name:ident $(= $default_value:literal)? $(;)?)* }) => { - $(#[cfg_attr(not(doctest), $($def_attrs)*)])* - impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { - fn read_thrift(prot: &mut R) -> Result { - $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?)> = None;)* - let mut last_field_id = 0i16; - loop { - let field_ident = prot.read_field_begin(last_field_id)?; - if field_ident.field_type == FieldType::Stop { - break; - } - match field_ident.id { - $($field_id => { - let val = $crate::__thrift_read_field!(prot, field_ident, $field_type $($field_lt)? $($element_type)?); - $field_name = Some(val); - })* - _ => { - prot.skip(field_ident.field_type)?; - } - }; - last_field_id = field_ident.id; - } - $($crate::__thrift_result_required_or_optional!($required_or_optional $field_name);)* - Ok(Self { - $($field_name),* - }) - } - } - } -} - -/// only implements WriteThrift for the give IDL struct definition -#[macro_export] -macro_rules! thrift_struct_write_impl { - ($(#[$($def_attrs:tt)*])* $vis:vis struct $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $required_or_optional:ident $field_type:ident $(< $field_lt:lifetime >)? $(< $element_type:ident >)? $field_name:ident $(= $default_value:literal)? $(;)?)* }) => { - impl $(<$lt>)? WriteThrift for $identifier $(<$lt>)? { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - #[allow(unused_mut, unused_variables)] - let mut last_field_id = 0i16; - $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* - writer.write_struct_end() - } - } - - impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } - } - } -} - #[doc(hidden)] #[macro_export] macro_rules! __thrift_write_required_or_optional_field { @@ -510,3 +455,25 @@ macro_rules! __thrift_read_variant { Self::$field_name }}; } + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_lhs { + ($field_name:ident $field_type:ident, $val:tt) => { + Self::$field_name($val) + }; + ($field_name:ident, $val:tt) => { + Self::$field_name + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_rhs { + ($field_id:literal $field_type:ident, $writer:tt, $val:ident) => { + $val.write_thrift_field($writer, $field_id, 0)? + }; + ($field_id:literal, $writer:tt, $val:tt) => { + $writer.write_empty_struct($field_id, 0)? + }; +} From 67a82f48c045216803702c8a3417bd473491063c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Sep 2025 10:51:33 -0700 Subject: [PATCH 086/126] start on documentation of thrift processing --- parquet/THRIFT.md | 100 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 parquet/THRIFT.md diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md new file mode 100644 index 000000000000..914161ffb8b2 --- /dev/null +++ b/parquet/THRIFT.md @@ -0,0 +1,100 @@ + + +# Thrift serialization in the parquet crate + +For both performance and flexibility reasons, this crate uses custom Thrift parsers and +serialization mechanisms. 
For many of the objects defined by the Parquet specification, macros
+are used to generate the objects as well as the code to serialize them. But in certain instances
+(performance bottlenecks, additions to the spec, etc.), it becomes necessary to implement the
+serialization code manually. This document describes both the standard usage of the
+Thrift macros and how to implement custom encoders and decoders.
+
+## Thrift macros
+
+The Parquet specification utilizes Thrift enums, unions, and structs, defined by an Interface
+Description Language (IDL). This IDL is usually parsed by a Thrift code generator to produce
+language-specific structures and serialization/deserialization code. This crate, however, uses
+Rust macros to perform the same function. This allows for customizations that produce more
+performant code, as well as the ability to pick and choose which fields to process.
+
+### Enums
+
+Thrift enums are the simplest structure, and are logically identical to Rust enums with unit
+variants. The IDL description will look like
+```
+enum Type {
+  BOOLEAN = 0;
+  INT32 = 1;
+  INT64 = 2;
+  INT96 = 3;
+  FLOAT = 4;
+  DOUBLE = 5;
+  BYTE_ARRAY = 6;
+  FIXED_LEN_BYTE_ARRAY = 7;
+}
+```
+The `thrift_enum` macro can be used in this instance.
+```rust
+thrift_enum!(
+  enum Type {
+    BOOLEAN = 0;
+    INT32 = 1;
+    INT64 = 2;
+    INT96 = 3;
+    FLOAT = 4;
+    DOUBLE = 5;
+    BYTE_ARRAY = 6;
+    FIXED_LEN_BYTE_ARRAY = 7;
+}
+);
+```
+which will produce a public Rust enum
+```rust
+pub enum Type {
+    BOOLEAN,
+    INT32,
+    INT64,
+    INT96,
+    FLOAT,
+    DOUBLE,
+    BYTE_ARRAY,
+    FIXED_LEN_BYTE_ARRAY,
+}
+```
+
+### Unions
+
+### Structs
+
+## Serialization traits
+
+### ReadThrift
+
+### WriteThrift
+
+### WriteThriftField
+
+## I/O
+
+### Readers
+
+### Writers
+
+## Customization
\ No newline at end of file

From 1c71b42260c381c09bc333f7ff92f7bb2d24e519 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 16 Sep 2025 14:00:38 -0700
Subject: [PATCH 087/126] more docs

---
 parquet/THRIFT.md | 326 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 317 insertions(+), 9 deletions(-)

diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md
index 914161ffb8b2..d01c6af1c114 100644
--- a/parquet/THRIFT.md
+++ b/parquet/THRIFT.md
@@ -22,7 +22,7 @@

 For both performance and flexibility reasons, this crate uses custom Thrift parsers and
 serialization mechanisms. For many of the objects defined by the Parquet specification, macros
 are used to generate the objects as well as the code to serialize them. But in certain instances
-(performance bottlenecks, additions to the spec, etc.), it becomes necessary to implement the
+(performance bottlenecks, additions to the spec, etc.), it becomes necessary to implement the
 serialization code manually. This document describes both the standard usage of the
 Thrift macros and how to implement custom encoders and decoders.

@@ -31,13 +31,14 @@ Thrift macros and how to implement custom encoders and decoders.

 The Parquet specification utilizes Thrift enums, unions, and structs, defined by an Interface
 Description Language (IDL). This IDL is usually parsed by a Thrift code generator to produce
 language-specific structures and serialization/deserialization code. This crate, however, uses
-Rust macros to perform the same function. This allows for customizations that produce more
+Rust macros to perform the same function. This allows for customizations that produce more
 performant code, as well as the ability to pick and choose which fields to process.
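To give a flavor of what the macros provide: each generated type can be serialized to and
deserialized from bytes via the protocol objects described later in this document. A minimal
round trip might look like the following sketch (illustrative only: the protocol types are
internal to the crate, and the choice of `PageEncodingStats` here is just an example):

```rust
// Hypothetical round trip of a macro-generated struct through the compact protocol.
// `stats` is assumed to be a PageEncodingStats value.
let mut buf: Vec<u8> = Vec::new();
{
    let mut out = ThriftCompactOutputProtocol::new(&mut buf);
    stats.write_thrift(&mut out)?;
}
let mut prot = ThriftSliceInputProtocol::new(buf.as_slice());
let decoded = PageEncodingStats::read_thrift(&mut prot)?;
```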
### Enums

Thrift enums are the simplest structure, and are logically identical to Rust enums with unit
variants. The IDL description will look like
+
```
enum Type {
  BOOLEAN = 0;
  INT32 = 1;
  INT64 = 2;
  INT96 = 3;
  FLOAT = 4;
  DOUBLE = 5;
  BYTE_ARRAY = 6;
  FIXED_LEN_BYTE_ARRAY = 7;
}
```
+
The `thrift_enum` macro can be used in this instance.
+
```rust
thrift_enum!(
  enum Type {
    BOOLEAN = 0;
    INT32 = 1;
    INT64 = 2;
    INT96 = 3;
    FLOAT = 4;
    DOUBLE = 5;
    BYTE_ARRAY = 6;
    FIXED_LEN_BYTE_ARRAY = 7;
}
);
```
+
which will produce a public Rust enum
+
```rust
pub enum Type {
    BOOLEAN,
    INT32,
    INT64,
    INT96,
    FLOAT,
    DOUBLE,
    BYTE_ARRAY,
    FIXED_LEN_BYTE_ARRAY,
}
```

### Unions

+Thrift unions are a special kind of struct in which only a single field is populated. In this
+regard they are much like Rust enums which can have a mix of unit and tuple variants. Because of
+this flexibility, specifying unions is a little bit trickier.
+
+Oftentimes a union will be defined for which all the variants are typed with empty structs. For
+example, the `TimeUnit` union used for `LogicalType`s.
+
+```
+struct MilliSeconds {}
+struct MicroSeconds {}
+struct NanoSeconds {}
+union TimeUnit {
+  1: MilliSeconds MILLIS
+  2: MicroSeconds MICROS
+  3: NanoSeconds NANOS
+}
+```
+
+When serialized, these empty structs become a single `0` (to mark the end of the struct). As an
+optimization, and to allow for a simpler interface, the `thrift_union_all_empty` macro can be used.
+
+```rust
+thrift_union_all_empty!(
+union TimeUnit {
+  1: MilliSeconds MILLIS
+  2: MicroSeconds MICROS
+  3: NanoSeconds NANOS
+}
+);
+```
+
+This macro will ignore the types specified for each variant, and will produce the following Rust
+`enum`:
+
+```rust
+pub enum TimeUnit {
+    MILLIS,
+    MICROS,
+    NANOS,
+}
+```
+
+For unions with mixed variant types, some modifications to the IDL are necessary. Take the
+definition of `ColumnCryptoMetadata`.
+
+```
+struct EncryptionWithFooterKey {
+}
+
+struct EncryptionWithColumnKey {
+  /** Column path in schema **/
+  1: required list<string> path_in_schema
+
+  /** Retrieval metadata of column encryption key **/
+  2: optional binary key_metadata
+}
+
+union ColumnCryptoMetaData {
+  1: EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY
+  2: EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY
+}
+```
+
+The `ENCRYPTION_WITH_FOOTER_KEY` variant is typed with an empty struct, while
+`ENCRYPTION_WITH_COLUMN_KEY` has the type of a struct with fields. In this case, the `thrift_union`
+macro is used.
+
+```rust
+thrift_union!(
+union ColumnCryptoMetaData {
+  1: ENCRYPTION_WITH_FOOTER_KEY
+  2: (EncryptionWithColumnKey) ENCRYPTION_WITH_COLUMN_KEY
+}
+);
+```
+
+Here, the type has been omitted for `ENCRYPTION_WITH_FOOTER_KEY` to indicate it should be a unit
+variant, while the type for `ENCRYPTION_WITH_COLUMN_KEY` is enclosed in parens. The parens are
+necessary to provide a semantic clue to the macro that the identifier is a type. The above will
+produce the Rust enum
+
+```rust
+pub enum ColumnCryptoMetaData {
+    ENCRYPTION_WITH_FOOTER_KEY,
+    ENCRYPTION_WITH_COLUMN_KEY(EncryptionWithColumnKey),
+}
+```
+
+### Structs
+
+The `thrift_struct` macro is used for structs. This macro is a little more flexible than the others
+because it allows for the visibility to be specified, and also allows for lifetimes to be specified
+for the defined structs as well as their fields. An example of this is the `SchemaElement` struct.
+This is defined in this crate as
+
+```rust
+thrift_struct!(
+pub(crate) struct SchemaElement<'a> {
+  1: optional Type type_;
+  2: optional i32 type_length;
+  3: optional Repetition repetition_type;
+  4: required string<'a> name;
+  5: optional i32 num_children;
+  6: optional ConvertedType converted_type;
+  7: optional i32 scale
+  8: optional i32 precision
+  9: optional i32 field_id;
+  10: optional LogicalType logical_type
+}
+);
+```
+
+Here the `string` field `name` is given a lifetime annotation, which is then propagated to the
+struct definition. Without this annotation, the resultant field would be a `String` type, rather
+than a string slice. The visibility of this struct (and all fields) will be `pub(crate)`. The
+resultant Rust struct will be
+
+```rust
+pub(crate) struct SchemaElement<'a> {
+    pub(crate) type_: Type, // here we've changed the name `type` to `type_` to avoid reserved words
+    pub(crate) type_length: i32,
+    pub(crate) repetition_type: Repetition,
+    pub(crate) name: &'a str,
+    ...
+}
+```
+
+The lifetime annotations can also be added to list elements, as in
+
+```rust
+thrift_struct!(
+struct FileMetaData<'a> {
+  /** Version of this file **/
+  1: required i32 version
+  2: required list<'a><SchemaElement> schema;
+  3: required i64 num_rows
+  4: required list<'a><RowGroup> row_groups
+  5: optional list<KeyValue> key_value_metadata
+  6: optional string created_by
+  7: optional list<ColumnOrder> column_orders;
+  8: optional EncryptionAlgorithm encryption_algorithm
+  9: optional binary footer_signing_key_metadata
+}
+);
+```
+
+Note that the lifetime annotation precedes the element type specification.
+
+## Serialization traits
+
+Serialization is performed via several Rust traits. On the deserialization side, objects implement
+the `ReadThrift` trait. This defines a `read_thrift` function that takes a
+`ThriftCompactInputProtocol` I/O object as an argument. The `read_thrift` function performs
+all steps necessary to deserialize the object from the input stream, and is usually produced by
+one of the macros mentioned above.
+
+On the serialization side, the `WriteThrift` and `WriteThriftField` traits are used in conjunction
+with a `ThriftCompactOutputProtocol` struct. As above, the Thrift macros produce the necessary
+implementations needed to perform serialization.
+
+While the macros can be used in most circumstances, sometimes more control is needed. The following
+sections provide information on how to provide custom implementations for the serialization
+traits.
+
+### ReadThrift Customization
+
+Thrift enums are serialized as a single `i32` value. The process of reading an enum is straightforward:
+read the enum discriminant, and then match on the possible values. For instance, reading the
+`ConvertedType` enum becomes:
+
+```rust
+impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ConvertedType {
+    fn read_thrift(prot: &mut R) -> Result<Self> {
+        let val = prot.read_i32()?;
+        Ok(match val {
+            0 => Self::UTF8,
+            1 => Self::MAP,
+            2 => Self::MAP_KEY_VALUE,
+            ...
+            21 => Self::INTERVAL,
+            _ => return Err(general_err!("Unexpected ConvertedType {}", val)),
+        })
+    }
+}
+```
+
+The default behavior is to return an error when an unexpected value is encountered. One could,
+however, provide an `Unknown` variant if forwards compatibility is needed in the case of an
+evolving enum.
+
+Deserializing structs is more involved, but still fairly easy. A Thrift struct is serialized as
+repeated `(field_id,field_type,field)` tuples.
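+
+As a taste of the wire format (the field header and varint encodings are described in detail
+next), a hypothetical struct whose field 1 is an `i32` set to `2` and whose field 2 is an `i64`
+set to `-1` would serialize to five bytes:
+
+```
+0x15 0x04   // field header: delta 1, type 5 (i32); then zigzag(2) = 4 as a varint
+0x16 0x01   // field header: delta 1, type 6 (i64); then zigzag(-1) = 1 as a varint
+0x00        // end of struct (Stop)
+```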
The `field_id` and `field_type` usually occupy a
+single byte, followed by the Thrift encoded field. Because only 4 bits are available for the id,
+encoders will usually encode a delta from the preceding field id rather than an absolute id. If
+the delta would exceed 15, then the `field_id` nybble will be set to `0`, and the `field_id` will
+instead be encoded as a varint, following the `field_type`. Fields will generally be read in a
+loop, with the `field_id` and `field_type` read first, and then the `field_id` used to determine
+which field to read. When a `field_type` of `Stop` (a lone `0` byte) is encountered, this marks
+the end of the struct and processing ceases.
+Here is an example of the processing loop:
+
+```rust
+    let mut last_field_id = 0i16;
+    loop {
+        // read the field id and field type. break if we encounter `Stop`
+        let field_ident = prot.read_field_begin(last_field_id)?;
+        if field_ident.field_type == FieldType::Stop {
+            break;
+        }
+        // match on the field id
+        match field_ident.id {
+            1 => {
+                let val = i32::read_thrift(&mut *prot)?;
+                num_values = Some(val);
+            }
+            2 => {
+                let val = Encoding::read_thrift(&mut *prot)?;
+                encoding = Some(val);
+            }
+            3 => {
+                let val = Encoding::read_thrift(&mut *prot)?;
+                definition_level_encoding = Some(val);
+            }
+            4 => {
+                let val = Encoding::read_thrift(&mut *prot)?;
+                repetition_level_encoding = Some(val);
+            }
+            // Thrift structs are meant to be forward compatible, so do not error
+            // here. Instead, simply skip unknown fields.
+            _ => {
+                prot.skip(field_ident.field_type)?;
+            }
+        };
+        // set the last seen field id to calculate the next field_id
+        last_field_id = field_ident.id;
+    }
+```
+
+Thrift unions are encoded as structs, but only a single field will be encoded. The loop above
+can be eliminated, and only the `match` on the id performed. A subsequent call to
+`read_field_begin` must return `Stop`, or an error should be returned. Here's an example from
+the decoding of the `LogicalType` union:
+
+```rust
+    // read the discriminant, error if it is `0`
+    let field_ident = prot.read_field_begin(0)?;
+    if field_ident.field_type == FieldType::Stop {
+        return Err(general_err!("received empty union from remote LogicalType"));
+    }
+    let ret = match field_ident.id {
+        1 => {
+            prot.skip_empty_struct()?;
+            Self::String
+        }
+        ...
+        _ => {
+            // LogicalType needs to be forward compatible, so we have defined a `_Unknown`
+            // variant for it. This can return an error if forward compatibility is not desired.
+            prot.skip(field_ident.field_type)?;
+            Self::_Unknown {
+                field_id: field_ident.id,
+            }
+        }
+    };
+    // test to ensure there is only one field present
+    let field_ident = prot.read_field_begin(field_ident.id)?;
+    if field_ident.field_type != FieldType::Stop {
+        return Err(general_err!(
+            "Received multiple fields for union from remote LogicalType"
+        ));
+    }
+```
+
+### WriteThrift Customization
+
+On the serialization side, there are two traits to implement. The first, `WriteThrift`, is used
+for actually serializing the object. The other, `WriteThriftField`, handles serializing objects
+as struct fields.
+
+Serializing enums is as simple as writing the discriminant as an `i32`.
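+
+(In the compact protocol an `i32` is not written as four raw bytes: it is zigzag encoded and then
+written as a ULEB128 varint. The crate's writer does this internally when `write_i32` is called;
+the sketch below is purely illustrative and is not the crate's actual implementation.)
+
+```rust
+// Illustrative only: zigzag-encode an i32 (folding the sign into bit 0), then
+// emit it as a ULEB128 varint (7 payload bits per byte, high bit set = more bytes).
+fn encode_i32_compact(out: &mut Vec<u8>, value: i32) {
+    // zigzag: small magnitudes (positive or negative) stay small
+    let mut v = ((value << 1) ^ (value >> 31)) as u32;
+    while v >= 0x80 {
+        out.push((v as u8) | 0x80);
+        v >>= 7;
+    }
+    out.push(v as u8);
+}
+```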
As a concrete example, here is the custom serialization code for `ConvertedType`:
+
+```rust
+impl WriteThrift for ConvertedType {
+    const ELEMENT_TYPE: ElementType = ElementType::I32;
+
+    fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+        // because we've added NONE, the variant values are off by 1, so correct that here
+        writer.write_i32(*self as i32 - 1)
+    }
+}
+```
+
+Structs and unions are serialized by field. When performing the serialization, one needs to keep
+track of the last field that has been written, as this is needed to calculate the delta in the
+Thrift field header. For required fields this is not strictly necessary, but when writing
+optional fields it is. A typical `write_thrift` implementation will look like:
+
+```rust
+    fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> {
+        // required field f1
+        self.f1.write_thrift_field(writer, 1, 0)?; // field_id == 1, last_field_id == 0
+        // required field f2
+        self.f2.write_thrift_field(writer, 2, 1)?; // field_id == 2, last_field_id == 1
+        // final required field f3, we now save the last_field_id, which is returned by write_thrift_field
+        let mut last_field_id = self.f3.write_thrift_field(writer, 3, 2)?; // field_id == 3, last_field_id == 2
+
+        // optional field f4
+        if let Some(val) = self.f4.as_ref() {
+            last_field_id = val.write_thrift_field(writer, 4, last_field_id)?;
+        }
+        // optional field f5
+        if let Some(val) = self.f5.as_ref() {
+            last_field_id = val.write_thrift_field(writer, 5, last_field_id)?;
+        }
+        // write end of struct
+        writer.write_struct_end()
+    }
+```
+
+### Handling for lists

From 49813eab4865738e7b1d08f2b716f3ab1232c4b5 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 16 Sep 2025 14:20:42 -0700
Subject: [PATCH 088/126] finish first cut of THRIFT.md

---
 parquet/THRIFT.md | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md
index d01c6af1c114..642040454f74 100644
--- a/parquet/THRIFT.md
+++ b/parquet/THRIFT.md
@@ -406,3 +406,41 @@ optional fields it is. A typical `write_thrift` implementation will look like:
 ```

 ### Handling for lists
+
+Lists of serialized objects can usually be read using `parquet_thrift::read_thrift_vec` and written
+using the `WriteThrift::write_thrift` implementation for vectors of objects that implement
+`WriteThrift`.
+
+When reading a list, one first reads the list header, which provides the number of elements
+that have been encoded, and then reads the elements one at a time.
+
+```rust
+    // read the list header
+    let list_ident = prot.read_list_begin()?;
+    // allocate vector with enough capacity
+    let mut page_locations = Vec::with_capacity(list_ident.size as usize);
+    // read elements
+    for _ in 0..list_ident.size {
+        page_locations.push(read_page_location(prot)?);
+    }
+```
+
+Writing is simply the reverse: write the list header, and then serialize the elements:
+
+```rust
+    // write the list header
+    writer.write_list_begin(ElementType::Struct, page_locations.len())?;
+    // write the elements
+    for loc in &page_locations {
+        loc.write_thrift(writer)?;
+    }
+```
+
+## More examples
+
+For more examples, the easiest thing to do is to [expand](https://github.com/dtolnay/cargo-expand)
+the thrift macros.
For instance, to see the implementations generated in the `basic` module, type: + +```sh +% cargo expand -p parquet --lib --all-features basic +``` From 5298257f2b79b682c72801c009fa127fbc62ee2e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Sep 2025 16:16:47 -0700 Subject: [PATCH 089/126] clean up some stale documentation references --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 2 +- parquet/src/file/metadata/mod.rs | 8 ++++---- parquet/src/file/properties.rs | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index fb29bdf6561a..0eb533a15a81 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -261,7 +261,7 @@ impl ArrowReaderBuilder { /// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3) /// ``` /// - /// [`Index`]: crate::file::page_index::index::Index + /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData pub fn with_row_selection(self, selection: RowSelection) -> Self { Self { selection: Some(selection), diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 9f06dc184b6c..21ed97b8bde1 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -97,7 +97,7 @@ impl RowSelector { /// * It contains no [`RowSelector`] of 0 rows /// * Consecutive [`RowSelector`]s alternate skipping or selecting rows /// -/// [`PageIndex`]: crate::file::page_index::index::PageIndex +/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData #[derive(Debug, Clone, Default, Eq, PartialEq)] pub struct RowSelection { selectors: Vec, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a027f7a476c4..88f098f4be07 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -134,18 +134,18 @@ pub(crate) use writer::ThriftMetadataWriter; /// /// This structure is an in-memory representation of multiple [`ColumnIndex`] /// structures in a parquet file footer, as described in the Parquet [PageIndex -/// documentation]. Each [`Index`] holds statistics about all the pages in a +/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a /// particular column chunk. /// /// `column_index[row_group_number][column_number]` holds the -/// [`Index`] corresponding to column `column_number` of row group +/// [`ColumnIndex`] corresponding to column `column_number` of row group /// `row_group_number`. /// -/// For example `column_index[2][3]` holds the [`Index`] for the fourth +/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth /// column in the third row group of the parquet file. 
/// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData pub type ParquetColumnIndex = Vec>; /// [`OffsetIndexMetaData`] for each data page of each row group of each column diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 17f77f919dc6..abc3bd8b766b 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -639,7 +639,7 @@ impl WriterPropertiesBuilder { /// * If `Some`, must be greater than 0, otherwise will panic /// * If `None`, there's no effective limit. /// - /// [`Index`]: crate::file::page_index::index::Index + /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData pub fn set_column_index_truncate_length(mut self, max_length: Option) -> Self { if let Some(value) = max_length { assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."); From fd63d320c6370b0805fb592611ae7ef868b99852 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Sep 2025 16:27:14 -0700 Subject: [PATCH 090/126] add a todo --- parquet/src/basic.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 9153e28abc04..99f2dd7a8e5d 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -280,6 +280,9 @@ struct GeographyType<'a> { } ); +// TODO(ets): should we switch to tuple variants so we can use +// the thrift macros? + /// Logical types used by version 2.4.0+ of the Parquet format. /// /// This is an *entirely new* struct as of version @@ -586,7 +589,6 @@ impl WriteThriftField for LogicalType { // ---------------------------------------------------------------------- // Mirrors thrift enum `FieldRepetitionType` // -// Cannot use macro since the name is changed thrift_enum!( /// Representation of field types in schema. From 72ea8504adfc72c8566286abcd684db5280a6c92 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Sep 2025 16:36:04 -0700 Subject: [PATCH 091/126] more doc cleanup --- parquet/src/file/metadata/thrift_gen.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 195616836393..24c90d20f1ae 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -846,8 +846,8 @@ pub(crate) struct DictionaryPageHeader { ); // Statistics for the page header. This is separate because of the differing lifetime requirements -// for page handling vs column chunk. Once we start writing column chunks this might need to be -// revisited. +// for page handling vs column chunk. In particular, the `ThriftReadInputProtocol` used for page +// header reading cannot return `binary` data as slices. 
thrift_struct!( pub(crate) struct PageStatistics { 1: optional binary max; From 7560e705ae66e31b4e356be6a0fa57962530444f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 17 Sep 2025 12:50:27 -0700 Subject: [PATCH 092/126] fix typo --- parquet/src/file/properties.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 17f77f919dc6..b6003dc4d9dc 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -1299,7 +1299,7 @@ impl ReaderPropertiesBuilder { /// each page, if present. /// If set to `false`, then the reader will skip decoding the statistics. /// - /// Byte default statistics will not be decoded. + /// By default statistics will not be decoded. /// /// [`Statistics`]: crate::file::statistics::Statistics pub fn set_read_page_statistics(mut self, value: bool) -> Self { From e94a2de6c0141586901668e1591513c70bd0b797 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 17 Sep 2025 12:53:05 -0700 Subject: [PATCH 093/126] fix typo --- parquet/src/parquet_macros.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index b7608ca84994..5720fd4ce0e7 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -452,7 +452,7 @@ macro_rules! __thrift_required_or_optional { } // Performance note: using `expect` here is about 4% faster on the page index bench, -// but we want to propogate errors. Using `ok_or` is *much* slower. +// but we want to propagate errors. Using `ok_or` is *much* slower. #[doc(hidden)] #[macro_export] macro_rules! __thrift_result_required_or_optional { From 56a75d6f745c639de058873584c6b6094e00fcc3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 17 Sep 2025 14:23:41 -0700 Subject: [PATCH 094/126] clean up some imports --- parquet/src/file/serialized_reader.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 15b6c6be65e0..1442f0f67ca0 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -33,6 +33,9 @@ use crate::file::{ properties::{ReaderProperties, ReaderPropertiesPtr}, reader::*, }; +#[cfg(feature = "encryption")] +use crate::parquet_thrift::ThriftSliceInputProtocol; +use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; @@ -734,8 +737,6 @@ impl SerializedPageReaderContext { _page_index: usize, _dictionary_page: bool, ) -> Result { - use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; - let mut prot = ThriftReadInputProtocol::new(input); if self.read_stats { Ok(PageHeader::read_thrift(&mut prot)?) @@ -764,8 +765,6 @@ impl SerializedPageReaderContext { ) -> Result { match self.page_crypto_context(page_index, dictionary_page) { None => { - use crate::parquet_thrift::{ReadThrift, ThriftReadInputProtocol}; - let mut prot = ThriftReadInputProtocol::new(input); if self.read_stats { Ok(PageHeader::read_thrift(&mut prot)?) 
@@ -776,8 +775,6 @@ impl SerializedPageReaderContext { } } Some(page_crypto_context) => { - use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; - let data_decryptor = page_crypto_context.data_decryptor(); let aad = page_crypto_context.create_page_header_aad()?; @@ -790,12 +787,8 @@ impl SerializedPageReaderContext { let mut prot = ThriftSliceInputProtocol::new(buf.as_slice()); if self.read_stats { - use crate::file::metadata::thrift_gen::PageHeader; - Ok(PageHeader::read_thrift(&mut prot)?) } else { - use crate::file::metadata::thrift_gen::PageHeader; - Ok(PageHeader::read_thrift_without_stats(&mut prot)?) } } From b7a135b99e8611716116e2d73cd0e08615902875 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 18 Sep 2025 10:19:24 -0700 Subject: [PATCH 095/126] increment shift after test --- parquet/src/parquet_thrift.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 5d549f012c86..801b500746c3 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -212,10 +212,10 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { loop { let byte = self.read_byte()?; in_progress |= ((byte & 0x7F) as u64).wrapping_shl(shift); - shift += 7; if byte & 0x80 == 0 { return Ok(in_progress); } + shift += 7; } } From 7b549f981639960023b81a6ca6f3a14c79c75325 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 23 Sep 2025 11:08:44 -0700 Subject: [PATCH 096/126] update docs for PageStatistics --- parquet/src/file/metadata/thrift_gen.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 06223bf03af8..7515a70a63f1 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -836,10 +836,14 @@ pub(crate) struct DictionaryPageHeader { } ); -// Statistics for the page header. This is separate because of the differing lifetime requirements -// for page handling vs column chunk. Once we start writing column chunks this might need to be -// revisited. thrift_struct!( +/// Statistics for the page header. +/// +/// This is a duplicate of the [`Statistics`] struct above. Because the page reader uses +/// the [`Read`] API, we cannot read the min/max values as slices. This should not be +/// a huge problem since this crate no longer reads the page header statistics by default. 
+/// +/// [`Read`]: crate::parquet_thrift::ThriftReadInputProtocol pub(crate) struct PageStatistics { 1: optional binary max; 2: optional binary min; From 0701d60ea79e779542e7324378e649d229d31437 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 23 Sep 2025 12:07:35 -0700 Subject: [PATCH 097/126] backport some doc fixes --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 2 +- parquet/src/file/metadata/mod.rs | 8 ++++---- parquet/src/file/properties.rs | 2 +- parquet/src/file/statistics.rs | 9 ++++----- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 44e0441ac99f..ff221656a302 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -261,7 +261,7 @@ impl ArrowReaderBuilder { /// Skip 1100 (skip the remaining 900 rows in row group 2 and the first 200 rows in row group 3) /// ``` /// - /// [`Index`]: crate::file::page_index::index::Index + /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData pub fn with_row_selection(self, selection: RowSelection) -> Self { Self { selection: Some(selection), diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 9f06dc184b6c..21ed97b8bde1 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -97,7 +97,7 @@ impl RowSelector { /// * It contains no [`RowSelector`] of 0 rows /// * Consecutive [`RowSelector`]s alternate skipping or selecting rows /// -/// [`PageIndex`]: crate::file::page_index::index::PageIndex +/// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData #[derive(Debug, Clone, Default, Eq, PartialEq)] pub struct RowSelection { selectors: Vec, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index e279a7b4a847..caf001e5fa27 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -140,18 +140,18 @@ pub(crate) use writer::ThriftMetadataWriter; /// /// This structure is an in-memory representation of multiple [`ColumnIndex`] /// structures in a parquet file footer, as described in the Parquet [PageIndex -/// documentation]. Each [`Index`] holds statistics about all the pages in a +/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a /// particular column chunk. /// /// `column_index[row_group_number][column_number]` holds the -/// [`Index`] corresponding to column `column_number` of row group +/// [`ColumnIndex`] corresponding to column `column_number` of row group /// `row_group_number`. /// -/// For example `column_index[2][3]` holds the [`Index`] for the fourth +/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth /// column in the third row group of the parquet file. 
/// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -/// [`ColumnIndex`]: crate::format::ColumnIndex +/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData pub type ParquetColumnIndex = Vec>; /// [`OffsetIndexMetaData`] for each data page of each row group of each column diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index b6003dc4d9dc..a76db6465602 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -639,7 +639,7 @@ impl WriterPropertiesBuilder { /// * If `Some`, must be greater than 0, otherwise will panic /// * If `None`, there's no effective limit. /// - /// [`Index`]: crate::file::page_index::index::Index + /// [`Index`]: crate::file::page_index::column_index::ColumnIndexMetaData pub fn set_column_index_truncate_length(mut self, max_length: Option) -> Self { if let Some(value) = max_length { assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`."); diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index e51f445b7e7e..38c0d1ff06a0 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -518,15 +518,14 @@ pub(crate) fn page_stats_to_thrift(stats: Option<&Statistics>) -> Option Date: Tue, 23 Sep 2025 12:42:05 -0700 Subject: [PATCH 098/126] fix recently added test --- parquet/tests/encryption/encryption_async.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 5b9534231d7d..0a096e6e4183 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -34,9 +34,9 @@ use parquet::arrow::{ArrowWriter, AsyncArrowWriter}; use parquet::encryption::decrypt::FileDecryptionProperties; use parquet::encryption::encrypt::FileEncryptionProperties; use parquet::errors::ParquetError; +use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder}; use parquet::file::writer::SerializedFileWriter; -use parquet::format::FileMetaData; use std::io::Write; use std::sync::Arc; use tokio::fs::File; @@ -647,7 +647,7 @@ fn spawn_column_parallel_row_group_writer( async fn concatenate_parallel_row_groups( mut parquet_writer: SerializedFileWriter, mut serialize_rx: Receiver>, -) -> Result { +) -> Result { while let Some(task) = serialize_rx.recv().await { let result = task.await; let mut rg_out = parquet_writer.next_row_group()?; @@ -818,8 +818,8 @@ async fn test_multi_threaded_encrypted_writing() { let metadata = serialized_file_writer.close().unwrap(); // Close the file writer which writes the footer - assert_eq!(metadata.num_rows, 50); - assert_eq!(metadata.schema, metadata.schema); + assert_eq!(metadata.file_metadata().num_rows(), 50); + // TODO(ets): what was this meant to test? 
assert_eq!(metadata.schema, metadata.schema); // Check that the file was written correctly let (read_record_batches, read_metadata) = From 4977f2fc84c6214c8a774849cd0d4f0710250c70 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 23 Sep 2025 13:02:14 -0700 Subject: [PATCH 099/126] fix recently added test --- parquet/tests/encryption/encryption_async.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 5b9534231d7d..8d3b1e960be9 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -34,9 +34,9 @@ use parquet::arrow::{ArrowWriter, AsyncArrowWriter}; use parquet::encryption::decrypt::FileDecryptionProperties; use parquet::encryption::encrypt::FileEncryptionProperties; use parquet::errors::ParquetError; +use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder}; use parquet::file::writer::SerializedFileWriter; -use parquet::format::FileMetaData; use std::io::Write; use std::sync::Arc; use tokio::fs::File; @@ -647,7 +647,7 @@ fn spawn_column_parallel_row_group_writer( async fn concatenate_parallel_row_groups( mut parquet_writer: SerializedFileWriter, mut serialize_rx: Receiver>, -) -> Result { +) -> Result { while let Some(task) = serialize_rx.recv().await { let result = task.await; let mut rg_out = parquet_writer.next_row_group()?; @@ -818,8 +818,10 @@ async fn test_multi_threaded_encrypted_writing() { let metadata = serialized_file_writer.close().unwrap(); // Close the file writer which writes the footer - assert_eq!(metadata.num_rows, 50); - assert_eq!(metadata.schema, metadata.schema); + assert_eq!(metadata.file_metadata().num_rows(), 50); + // TODO(ets): what was this meant to test? The read and written schemas differ because an + // archaic form for a list was used in the source file. + // assert_eq!(metadata.schema, metadata.schema); // Check that the file was written correctly let (read_record_batches, read_metadata) = @@ -910,7 +912,9 @@ async fn test_multi_threaded_encrypted_writing_deprecated() { // Close the file writer which writes the footer let metadata = writer.finish().unwrap(); assert_eq!(metadata.file_metadata().num_rows(), 100); - // TODO(ets): wut? assert_eq!(metadata.schema, metadata.schema); + // TODO(ets): what was this meant to test? The read and written schemas differ because an + // archaic form for a list was used in the source file. + // assert_eq!(metadata.schema, metadata.schema); // Check that the file was written correctly let (read_record_batches, read_metadata) = From cbf1624e72ac99563b0977ee5cd8eb8c362b0dcd Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 23 Sep 2025 15:23:50 -0700 Subject: [PATCH 100/126] add TODO --- parquet/src/basic.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ee442d59742a..6556942d25eb 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -63,6 +63,10 @@ enum Type { // // Cannot use macros because of added field `None` +// TODO(ets): Adding the `NONE` variant to this enum is a bit awkward. We should +// look into removing it and using `Option` instead. Then all of this +// handwritten code could go away. + /// Common types (converted types) used by frameworks when using Parquet. /// /// This helps map between types in those frameworks to the base types in Parquet. 
From a87b0a255d22a5e4572d78853970247bce2c7e9a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 25 Sep 2025 09:29:12 -0700 Subject: [PATCH 101/126] forgot to check this in during merge --- parquet/src/file/metadata/writer.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index c7e1ad8622fe..cf40322def99 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -478,13 +478,6 @@ impl MetadataObjectWriter { object.write_thrift(&mut protocol)?; Ok(()) } - - #[inline] - fn write_thrift_object(object: &impl WriteThrift, sink: impl Write) -> Result<()> { - let mut protocol = ThriftCompactOutputProtocol::new(sink); - object.write_thrift(&mut protocol)?; - Ok(()) - } } /// Implementations of [`MetadataObjectWriter`] methods for when encryption is disabled From 13343700e88fa5f6c1ff894e49da5b70630dee07 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 25 Sep 2025 10:11:29 -0700 Subject: [PATCH 102/126] remove TODO --- parquet/src/file/metadata/writer.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index cf40322def99..6396e454fb09 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -776,8 +776,6 @@ impl MetadataObjectWriter { } let ciphertext = column_encryptor.encrypt(&buffer, &aad)?; - // TODO: remember to not serialize column meta data if encrypted_column_metadata - // is Some column_chunk.encrypted_column_metadata = Some(ciphertext); } } From 5c5c8264373ebd40a08779f047b5c658ba5434e1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 25 Sep 2025 10:23:32 -0700 Subject: [PATCH 103/126] add HeapSize for crypto fields on chunk metadata --- parquet/src/file/column_crypto_metadata.rs | 16 ++++++++++++++++ parquet/src/file/metadata/memory.rs | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 6a538bd42bc0..429e7946dd67 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -20,6 +20,7 @@ use std::io::Write; use crate::errors::{ParquetError, Result}; +use crate::file::metadata::HeapSize; use crate::format::{ ColumnCryptoMetaData as TColumnCryptoMetaData, EncryptionWithColumnKey as TEncryptionWithColumnKey, @@ -45,6 +46,12 @@ pub struct EncryptionWithColumnKey { } ); +impl HeapSize for EncryptionWithColumnKey { + fn heap_size(&self) -> usize { + self.path_in_schema.heap_size() + self.key_metadata.heap_size() + } +} + thrift_union!( /// ColumnCryptoMetadata for a column chunk union ColumnCryptoMetaData { @@ -53,6 +60,15 @@ union ColumnCryptoMetaData { } ); +impl HeapSize for ColumnCryptoMetaData { + fn heap_size(&self) -> usize { + match self { + Self::ENCRYPTION_WITH_FOOTER_KEY => 0, + Self::ENCRYPTION_WITH_COLUMN_KEY(path) => path.heap_size(), + } + } +} + /// Converts Thrift definition into `ColumnCryptoMetadata`. 
pub fn try_from_thrift( thrift_column_crypto_metadata: &TColumnCryptoMetaData, diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 19122a1b5522..6a163c361837 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -92,6 +92,25 @@ impl HeapSize for RowGroupMetaData { } } +#[cfg(feature = "encryption")] +impl HeapSize for ColumnChunkMetaData { + fn heap_size(&self) -> usize { + // don't count column_descr here because it is already counted in + // FileMetaData + self.encodings.heap_size() + + self.file_path.heap_size() + + self.compression.heap_size() + + self.statistics.heap_size() + + self.encoding_stats.heap_size() + + self.unencoded_byte_array_data_bytes.heap_size() + + self.repetition_level_histogram.heap_size() + + self.definition_level_histogram.heap_size() + + self.column_crypto_metadata.heap_size() + + self.encrypted_column_metadata.heap_size() + } +} + +#[cfg(not(feature = "encryption"))] impl HeapSize for ColumnChunkMetaData { fn heap_size(&self) -> usize { // don't count column_descr here because it is already counted in From 1bca0a04e27f40f830000c4c65dabaadd36b7264 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 25 Sep 2025 15:09:11 -0700 Subject: [PATCH 104/126] remove unnecessary checks --- parquet/tests/encryption/encryption_async.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 8d3b1e960be9..6999b1a931f4 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -819,9 +819,6 @@ async fn test_multi_threaded_encrypted_writing() { // Close the file writer which writes the footer assert_eq!(metadata.file_metadata().num_rows(), 50); - // TODO(ets): what was this meant to test? The read and written schemas differ because an - // archaic form for a list was used in the source file. - // assert_eq!(metadata.schema, metadata.schema); // Check that the file was written correctly let (read_record_batches, read_metadata) = @@ -912,9 +909,6 @@ async fn test_multi_threaded_encrypted_writing_deprecated() { // Close the file writer which writes the footer let metadata = writer.finish().unwrap(); assert_eq!(metadata.file_metadata().num_rows(), 100); - // TODO(ets): what was this meant to test? The read and written schemas differ because an - // archaic form for a list was used in the source file. - // assert_eq!(metadata.schema, metadata.schema); // Check that the file was written correctly let (read_record_batches, read_metadata) = From c3907dc7a16390349e12d1fc03e524486c5d052c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Sep 2025 08:05:37 -0700 Subject: [PATCH 105/126] implement suggestions from review --- parquet/benches/metadata.rs | 2 + parquet/src/file/metadata/memory.rs | 24 +++------ parquet/src/file/metadata/thrift_gen.rs | 71 +++++++++---------------- 3 files changed, 32 insertions(+), 65 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 1992a6868f43..d05f1e09cb11 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -219,6 +219,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + // FIXME(ets): remove benches of private APIs c.bench_function("decode thrift file metadata", |b| { b.iter(|| { parquet::thrift::bench_file_metadata(&meta_data); @@ -239,6 +240,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); // rewrite file with page statistics. then read page headers. 
+ // FIXME(ets): remove the page header benches when remodel is complete #[cfg(feature = "arrow")] let (file_bytes, metadata) = rewrite_file(data.clone()); #[cfg(feature = "arrow")] diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 6a163c361837..bfe6b0255c5c 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -92,27 +92,14 @@ impl HeapSize for RowGroupMetaData { } } -#[cfg(feature = "encryption")] impl HeapSize for ColumnChunkMetaData { fn heap_size(&self) -> usize { - // don't count column_descr here because it is already counted in - // FileMetaData - self.encodings.heap_size() - + self.file_path.heap_size() - + self.compression.heap_size() - + self.statistics.heap_size() - + self.encoding_stats.heap_size() - + self.unencoded_byte_array_data_bytes.heap_size() - + self.repetition_level_histogram.heap_size() - + self.definition_level_histogram.heap_size() - + self.column_crypto_metadata.heap_size() - + self.encrypted_column_metadata.heap_size() - } -} + #[cfg(feature = "encryption")] + let encryption_heap_size = + self.column_crypto_metadata.heap_size() + self.encrypted_column_metadata.heap_size(); + #[cfg(not(feature = "encryption"))] + let encryption_heap_size = 0; -#[cfg(not(feature = "encryption"))] -impl HeapSize for ColumnChunkMetaData { - fn heap_size(&self) -> usize { // don't count column_descr here because it is already counted in // FileMetaData self.encodings.heap_size() @@ -123,6 +110,7 @@ impl HeapSize for ColumnChunkMetaData { + self.unencoded_byte_array_data_bytes.heap_size() + self.repetition_level_histogram.heap_size() + self.definition_level_histogram.heap_size() + + encryption_heap_size } } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 200d61d2e300..5665ad2ce9ca 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1265,7 +1265,6 @@ pub(crate) struct FileMeta<'a> { impl<'a> WriteThrift for FileMeta<'a> { const ELEMENT_TYPE: ElementType = ElementType::Struct; - #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { self.file_metadata .version @@ -1361,7 +1360,7 @@ fn write_schema( type_length: None, repetition_type: repetition, name: basic_info.name(), - num_children: Some(fields.len() as i32), + num_children: Some(fields.len().try_into()?), converted_type: match basic_info.converted_type() { ConvertedType::NONE => None, other => Some(other), @@ -1432,10 +1431,10 @@ impl WriteThrift for RowGroupMetaData { // 8: optional ColumnCryptoMetaData crypto_metadata // 9: optional binary encrypted_column_metadata // } -#[cfg(feature = "encryption")] impl WriteThrift for ColumnChunkMetaData { const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; if let Some(file_path) = self.file_path() { @@ -1445,8 +1444,18 @@ impl WriteThrift for ColumnChunkMetaData { .file_offset() .write_thrift_field(writer, 2, last_field_id)?; - // only write the ColumnMetaData if we haven't already encrypted it - if self.encrypted_column_metadata.is_none() { + #[cfg(feature = "encryption")] + { + // only write the ColumnMetaData if we haven't already encrypted it + if self.encrypted_column_metadata.is_none() { + writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; + serialize_column_meta_data(self, writer)?; + 
last_field_id = 3; + } + } + #[cfg(not(feature = "encryption"))] + { + // always write the ColumnMetaData writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; serialize_column_meta_data(self, writer)?; last_field_id = 3; @@ -1464,48 +1473,16 @@ impl WriteThrift for ColumnChunkMetaData { if let Some(column_idx_len) = self.column_index_length() { last_field_id = column_idx_len.write_thrift_field(writer, 7, last_field_id)?; } - if let Some(crypto_metadata) = self.crypto_metadata() { - last_field_id = crypto_metadata.write_thrift_field(writer, 8, last_field_id)?; - } - if let Some(encrypted_meta) = self.encrypted_column_metadata.as_ref() { - encrypted_meta - .as_slice() - .write_thrift_field(writer, 9, last_field_id)?; - } - - writer.write_struct_end() - } -} - -#[cfg(not(feature = "encryption"))] -impl WriteThrift for ColumnChunkMetaData { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if let Some(file_path) = self.file_path() { - last_field_id = file_path.write_thrift_field(writer, 1, last_field_id)?; - } - last_field_id = self - .file_offset() - .write_thrift_field(writer, 2, last_field_id)?; - - // always write the ColumnMetaData - writer.write_field_begin(FieldType::Struct, 3, last_field_id)?; - serialize_column_meta_data(self, writer)?; - last_field_id = 3; - - if let Some(offset_idx_off) = self.offset_index_offset() { - last_field_id = offset_idx_off.write_thrift_field(writer, 4, last_field_id)?; - } - if let Some(offset_idx_len) = self.offset_index_length() { - last_field_id = offset_idx_len.write_thrift_field(writer, 5, last_field_id)?; - } - if let Some(column_idx_off) = self.column_index_offset() { - last_field_id = column_idx_off.write_thrift_field(writer, 6, last_field_id)?; - } - if let Some(column_idx_len) = self.column_index_length() { - column_idx_len.write_thrift_field(writer, 7, last_field_id)?; + #[cfg(feature = "encryption")] + { + if let Some(crypto_metadata) = self.crypto_metadata() { + last_field_id = crypto_metadata.write_thrift_field(writer, 8, last_field_id)?; + } + if let Some(encrypted_meta) = self.encrypted_column_metadata.as_ref() { + encrypted_meta + .as_slice() + .write_thrift_field(writer, 9, last_field_id)?; + } } writer.write_struct_end() From 653fa1a0b2a5a13d4392775f15b5866a83605811 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Sep 2025 08:29:10 -0700 Subject: [PATCH 106/126] remove TODO --- parquet/src/file/metadata/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index c2f7062dffc4..e46a049bc5e4 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -423,7 +423,6 @@ impl From for ParquetMetaDataBuilder { } } -// TODO(ets): should this move to thrift_gen? thrift_struct!( /// A key-value pair for [`FileMetaData`]. 
 pub struct KeyValue {

From 91e3df75ca9857d77f7577a5cd9a673b67499bed Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 26 Sep 2025 08:31:28 -0700
Subject: [PATCH 107/126] more todos

---
 parquet/src/file/metadata/writer.rs | 2 +-
 parquet/src/thrift.rs               | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs
index 12c99cd7fad3..593ac95c78bb 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -431,7 +431,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
     }

     fn convert_column_indexes(&self) -> Option<Vec<Vec<Option<ColumnIndex>>>> {
-        // FIXME(ets): we're converting from ParquetColumnIndex to Vec<Vec<Option<ColumnIndex>>>,
+        // TODO(ets): we're converting from ParquetColumnIndex to Vec<Vec<Option<ColumnIndex>>>,
         // but then converting back to ParquetColumnIndex in the end. need to unify this.
         self.metadata
             .column_index()
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index ff9e9a39aac1..b7d327504862 100644
--- a/parquet/src/thrift.rs
+++ b/parquet/src/thrift.rs
@@ -30,6 +30,7 @@ pub trait TSerializable: Sized {
     fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> thrift::Result<()>;
 }

+// TODO(ets): remove the next 3 functions when the Thrift remodel is complete
 // Public function to aid benchmarking. Reads Parquet `FileMetaData` encoded in `bytes`.
 #[doc(hidden)]
 pub fn bench_file_metadata(bytes: &bytes::Bytes) {

From 4b8c68b10717758d84ec7da800323de8bfd152d5 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 26 Sep 2025 09:36:49 -0700
Subject: [PATCH 108/126] variant logical type fixes

---
 parquet/src/arrow/schema/extension.rs | 4 ++--
 parquet/src/variant.rs                | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index 752b9a5ced87..5419e5d6ddf6 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -37,7 +37,7 @@ use arrow_schema::Field;
 pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field {
     match parquet_type.get_basic_info().logical_type() {
         #[cfg(feature = "variant_experimental")]
-        Some(LogicalType::Variant) => {
+        Some(LogicalType::Variant{..}) => {
             // try to add the Variant extension type, but if that fails (e.g. because the
             // storage type is not supported), just return the field as is
             arrow_field
@@ -60,7 +60,7 @@ pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
         return None;
     }
     match field.try_extension_type::<VariantType>() {
-        Ok(VariantType) => Some(LogicalType::Variant),
+        Ok(VariantType) => Some(LogicalType::Variant{ specification_version: None }),
         // Given check above, this should not error, but if it does ignore
         Err(_e) => None,
     }
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
index 497d1dc6c4f3..bc52a2b41d13 100644
--- a/parquet/src/variant.rs
+++ b/parquet/src/variant.rs
@@ -202,7 +202,9 @@ mod tests {
         // data should have been written with the Variant logical type
         assert_eq!(
             field.get_basic_info().logical_type(),
-            Some(crate::basic::LogicalType::Variant)
+            Some(crate::basic::LogicalType::Variant {
+                specification_version: None
+            })
         );
     }

From 80fc032cdf232a6b9116ab410ce681b85e1307a5 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 26 Sep 2025 10:02:10 -0700
Subject: [PATCH 109/126] remove lint

---
 parquet/src/arrow/schema/extension.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index 5419e5d6ddf6..ec6527672b5c 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -37,7 +37,7 @@ use arrow_schema::Field;
 pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field {
     match parquet_type.get_basic_info().logical_type() {
         #[cfg(feature = "variant_experimental")]
-        Some(LogicalType::Variant{..}) => {
+        Some(LogicalType::Variant { .. }) => {
             // try to add the Variant extension type, but if that fails (e.g. because the
             // storage type is not supported), just return the field as is
             arrow_field
@@ -60,7 +60,9 @@ pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
         return None;
     }
     match field.try_extension_type::<VariantType>() {
-        Ok(VariantType) => Some(LogicalType::Variant{ specification_version: None }),
+        Ok(VariantType) => Some(LogicalType::Variant {
+            specification_version: None,
+        }),
         // Given check above, this should not error, but if it does ignore
         Err(_e) => None,
     }

From 61773a0ca67070ab358c6c22bfaac711e441ae1e Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 26 Sep 2025 10:38:53 -0700
Subject: [PATCH 110/126] remove private APIs from metadata benchmark

---
 parquet/benches/metadata.rs | 93 ++++++-------------------------------
 parquet/src/thrift.rs       | 14 ------
 2 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs
index 8c886e4d5eea..b05d6c33a96a 100644
--- a/parquet/benches/metadata.rs
+++ b/parquet/benches/metadata.rs
@@ -86,13 +86,13 @@ fn encoded_meta() -> Vec<u8> {
                 encodings: vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY],
                 path_in_schema: vec![],
                 codec: CompressionCodec::UNCOMPRESSED,
-                num_values: rng.random(),
-                total_uncompressed_size: rng.random(),
-                total_compressed_size: rng.random(),
+                num_values: rng.random_range(1..1000000),
+                total_uncompressed_size: rng.random_range(100000..100000000),
+                total_compressed_size: rng.random_range(50000..5000000),
                 key_value_metadata: None,
-                data_page_offset: rng.random(),
-                index_page_offset: Some(rng.random()),
-                dictionary_page_offset: Some(rng.random()),
+                data_page_offset: rng.random_range(4..2000000000),
+                index_page_offset: None,
+                dictionary_page_offset: Some(rng.random_range(4..2000000000)),
                 statistics: Some(stats.clone()),
                 encoding_stats: Some(vec![
                     PageEncodingStats {
@@ -111,10 +111,10 @@ fn encoded_meta() -> Vec<u8> {
                     size_statistics: None,
                     geospatial_statistics: None,
                 }),
-                offset_index_offset: Some(rng.random()),
-                offset_index_length: Some(rng.random()),
-                column_index_offset: Some(rng.random()),
-                column_index_length: Some(rng.random()),
+                offset_index_offset: Some(rng.random_range(0..2000000000)),
+                offset_index_length: Some(rng.random_range(1..100000)),
+                column_index_offset: Some(rng.random_range(0..2000000000)),
+                column_index_length: Some(rng.random_range(1..100000)),
                 crypto_metadata: None,
                 encrypted_column_metadata: None,
             })
@@ -122,11 +122,11 @@ fn encoded_meta() -> Vec<u8> {

             RowGroup {
                 columns,
-                total_byte_size: rng.random(),
-                num_rows: rng.random(),
+                total_byte_size: rng.random_range(1..2000000000),
+                num_rows: rng.random_range(1..10000000000),
                 sorting_columns: None,
                 file_offset: None,
-                total_compressed_size: Some(rng.random()),
+                total_compressed_size: Some(rng.random_range(1..1000000000)),
                 ordinal: Some(i as _),
             }
         })
@@ -136,7 +136,7 @@ fn encoded_meta() -> Vec<u8> {
         schema,
         row_groups,
         version: 1,
-        num_rows: rng.random(),
+        num_rows: rng.random_range(1..2000000000),
         key_value_metadata: None,
         created_by: Some("parquet-rs".into()),
         column_orders: None,
@@ -163,36 +163,6 @@ fn get_footer_bytes(data: Bytes) -> Bytes {
     data.slice(meta_start..meta_end)
 }

-#[cfg(feature = "arrow")]
-fn rewrite_file(bytes: Bytes) -> (Bytes, FileMetaData) {
-    use arrow::array::RecordBatchReader;
-    use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter};
-    use parquet::file::properties::{EnabledStatistics, WriterProperties};
-
-    let parquet_reader = ParquetRecordBatchReaderBuilder::try_new(bytes)
-        .expect("parquet open")
-        .build()
-        .expect("parquet open");
-    let writer_properties = WriterProperties::builder()
-        .set_statistics_enabled(EnabledStatistics::Page)
-        .set_write_page_header_statistics(true)
-        .build();
-    let mut output = Vec::new();
-    let mut parquet_writer = ArrowWriter::try_new(
-        &mut output,
-        parquet_reader.schema(),
-        Some(writer_properties),
-    )
-    .expect("create arrow writer");
-
-    for maybe_batch in parquet_reader {
-        let batch = maybe_batch.expect("reading batch");
-        parquet_writer.write(&batch).expect("writing data");
-    }
-    let file_meta = parquet_writer.close().expect("finalizing file");
-    (output.into(), file_meta)
-}
-
 fn criterion_benchmark(c: &mut Criterion) {
     // Read file into memory to isolate filesystem performance
     let file = "../parquet-testing/data/alltypes_tiny_pages.parquet";
@@ -217,47 +187,12 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });

-    c.bench_function("decode thrift file metadata", |b| {
-        b.iter(|| {
-            parquet::thrift::bench_file_metadata(&meta_data);
-        })
-    });
-
     let buf: Bytes = black_box(encoded_meta()).into();
     c.bench_function("decode parquet metadata (wide)", |b| {
         b.iter(|| {
             ParquetMetaDataReader::decode_metadata(&buf).unwrap();
         })
     });
-
-    c.bench_function("decode thrift file metadata (wide)", |b| {
-        b.iter(|| {
-            parquet::thrift::bench_file_metadata(&buf);
-        })
-    });
-
-    // rewrite file with page statistics. then read page headers.
- #[cfg(feature = "arrow")] - let (file_bytes, metadata) = rewrite_file(data.clone()); - #[cfg(feature = "arrow")] - c.bench_function("page headers", |b| { - b.iter(|| { - metadata.row_groups.iter().for_each(|rg| { - rg.columns.iter().for_each(|col| { - if let Some(col_meta) = &col.meta_data { - if let Some(dict_offset) = col_meta.dictionary_page_offset { - parquet::thrift::bench_page_header( - &file_bytes.slice(dict_offset as usize..), - ); - } - parquet::thrift::bench_page_header( - &file_bytes.slice(col_meta.data_page_offset as usize..), - ); - } - }); - }); - }) - }); } criterion_group!(benches, criterion_benchmark); diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index e16e394be2bb..1cbd47a90001 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -33,20 +33,6 @@ pub trait TSerializable: Sized { fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()>; } -// Public function to aid benchmarking. Reads Parquet `FileMetaData` encoded in `bytes`. -#[doc(hidden)] -pub fn bench_file_metadata(bytes: &bytes::Bytes) { - let mut input = TCompactSliceInputProtocol::new(bytes); - crate::format::FileMetaData::read_from_in_protocol(&mut input).unwrap(); -} - -// Public function to aid benchmarking. Reads Parquet `PageHeader` encoded in `bytes`. -#[doc(hidden)] -pub fn bench_page_header(bytes: &bytes::Bytes) { - let mut prot = TCompactSliceInputProtocol::new(bytes); - crate::format::PageHeader::read_from_in_protocol(&mut prot).unwrap(); -} - /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice /// /// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol From a6a6326a8fc8d154dda26989e67767359c6d2cd5 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 07:58:17 -0700 Subject: [PATCH 111/126] Apply suggestions from code review Co-authored-by: Matthijs Brobbel --- parquet/THRIFT.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md index 642040454f74..032c6e1f78e7 100644 --- a/parquet/THRIFT.md +++ b/parquet/THRIFT.md @@ -22,7 +22,7 @@ For both performance and flexibility reasons, this crate uses custom Thrift parsers and serialization mechanisms. For many of the objects defined by the Parquet specification macros are used to generate the objects as well as the code to serialize them. But in certain instances -(performance bottlenecks, additions to the spec, etc.),it becomes necessary to implement the +(performance bottlenecks, additions to the spec, etc.), it becomes necessary to implement the serialization code manually. This document serves to document both the standard usage of the Thrift macros, as well as how to implement custom encoders and decoders. @@ -31,7 +31,7 @@ Thrift macros, as well as how to implement custom encoders and decoders. The Parquet specification utilizes Thrift enums, unions, and structs, defined by an Interface Description Language (IDL). This IDL is usually parsed by a Thrift code generator to produce language specific structures and serialization/deserialization code. This crate, however, uses -Rust macros do perform the same function. This allows for customizations that produce more +Rust macros to perform the same function. This allows for customizations that produce more performant code, as well as the ability to pick and choose which fields to process. 
 ### Enums

@@ -149,7 +149,7 @@ union ColumnCryptoMetaData {
 }
 ```

-The `ENCRYPTION_WITH_FOOTER_KEY` variant is types with an empty struct, while
+The `ENCRYPTION_WITH_FOOTER_KEY` variant is typed with an empty struct, while
 `ENCRYPTION_WITH_COLUMN_KEY` has the type of a struct with fields. In this case, the
 `thrift_union` macro is used.

@@ -266,21 +266,21 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ConvertedType
             2 => Self::MAP_KEY_VALUE,
             ...
             21 => Self::INTERVAL,
-            _ => return Err(general_err!("Unexpected ConvertedType {}", val)),
+            _ => return Err(general_err!("Unexpected ConvertedType {val}")),
         })
     }
 }
 ```

 The default behavior is to return an error when an unexpected field is encountered. One could,
-however, provide an `Unknown` variant if forwards compatibility is neeeded in the case of an
+however, provide an `Unknown` variant if forward compatibility is needed in the case of an
 evolving enum.

 Deserializing structs is more involved, but still fairly easy. A thrift struct is serialized as
 repeated `(field_id,field_type,field)` tuples. The `field_id` and `field_type` usually occupy a
 single byte, followed by the Thrift encoded field. Because only 4 bits are available for the id,
 encoders usually will instead use deltas from the preceding field. If the delta will exceed 15,
-then the `field_id` nybble will be set to `0`, and the `field_id` will instead be encoded as a
+then the `field_id` nibble will be set to `0`, and the `field_id` will instead be encoded as a
 varint, following the `field_type`. Fields will generally be read in a loop, with the `field_id`
 and `field_type` read first, and then the `field_id` used to determine which field to read. When
 a `field_id` of `0` is encountered, this marks the end of the struct and processing ceases.
@@ -341,7 +341,7 @@ the decoding of the `LogicalType` union:
             }
             ...
             _ => {
-                // LogicalType needs to be forward compatible, so we have defined a `_Unknown`
+                // LogicalType needs to be forward compatible, so we have defined an `_Unknown`
                 // variant for it. This can return an error if forward compatibility is not desired.
                 prot.skip(field_ident.field_type)?;
                 Self::_Unknown {
@@ -364,7 +364,7 @@ On the serialization side, there are two traits to implement. The first, `WriteT
 for actually serializing the object. The other, `WriteThriftField`, handles serializing objects
 as struct fields.

-Serializing enums is a simple as writing the discriminant as an `i32`. For example, here is the
+Serializing enums is as simple as writing the discriminant as an `i32`. For example, here is the
 custom serialization code for `ConvertedType`:

 ```rust
@@ -425,7 +425,7 @@ that have been encoded, and then read elements one at a time.
} ``` -Writing is simply the reverse...write the list header, and then serialize the elements: +Writing is simply the reverse: write the list header, and then serialize the elements: ```rust // write the list header From b5651e5d49d8048c79ff77f50b0428451a31841b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 08:27:39 -0700 Subject: [PATCH 112/126] add test of invalid converted type --- parquet/src/basic.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 6556942d25eb..bdd452e6ef0f 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -1365,7 +1365,7 @@ impl str::FromStr for LogicalType { #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module mod tests { use super::*; - use crate::parquet_thrift::tests::test_roundtrip; + use crate::parquet_thrift::{tests::test_roundtrip, ThriftSliceInputProtocol}; #[test] fn test_display_type() { @@ -1448,6 +1448,17 @@ mod tests { test_roundtrip(ConvertedType::INTERVAL); } + #[test] + fn test_read_invalid_converted_type() { + let mut prot = ThriftSliceInputProtocol::new(&[0x7eu8]); + let res = ConvertedType::read_thrift(&mut prot); + assert!(res.is_err()); + assert_eq!( + res.unwrap_err().to_string(), + "Parquet error: Unexpected ConvertedType 63" + ); + } + #[test] fn test_display_converted_type() { assert_eq!(ConvertedType::NONE.to_string(), "NONE"); From 282a9254fb9fceccbfad13e68e38280693f09ca1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 08:31:34 -0700 Subject: [PATCH 113/126] use raw identifier for 'type' in SchemaElement --- parquet/THRIFT.md | 6 +++--- parquet/src/file/metadata/thrift_gen.rs | 6 +++--- parquet/src/schema/types.rs | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md index 032c6e1f78e7..1079cd8e6964 100644 --- a/parquet/THRIFT.md +++ b/parquet/THRIFT.md @@ -184,7 +184,7 @@ This is defined in this crate as ```rust thrift_struct!( pub(crate) struct SchemaElement<'a> { - 1: optional Type type_; + 1: optional Type r#type; 2: optional i32 type_length; 3: optional Repetition repetition_type; 4: required string<'a> name; @@ -205,7 +205,7 @@ resultant Rust struct will be ```rust pub(crate) struct SchemaElement<'a> { - pub(crate) type_: Type, // here we've changed the name `type` to `type_` to avoid reserved words + pub(crate) r#type: Type, // here we've changed the name `type` to `r#type` to avoid reserved words pub(crate) type_length: i32, pub(crate) repetition_type: Repetition, pub(crate) name: &'a str, @@ -266,7 +266,7 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ConvertedType 2 => Self::MAP_KEY_VALUE, ... 21 => Self::INTERVAL, - _ => return Err(general_err!("Unexpected ConvertedType {val}")), + _ => return Err(general_err!("Unexpected ConvertedType {}", val)), }) } } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 0ed8adf3b923..80caedcda0b1 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -55,7 +55,7 @@ use crate::{ thrift_struct!( pub(crate) struct SchemaElement<'a> { /** Data type for this field. 
Not set if the current element is a non-leaf node */ - 1: optional Type type_; + 1: optional Type r#type; 2: optional i32 type_length; 3: optional Repetition repetition_type; 4: required string<'a> name; @@ -1330,7 +1330,7 @@ fn write_schema_helper( precision, } => { let element = SchemaElement { - type_: Some(*physical_type), + r#type: Some(*physical_type), type_length: if *type_length >= 0 { Some(*type_length) } else { @@ -1366,7 +1366,7 @@ fn write_schema_helper( }; let element = SchemaElement { - type_: None, + r#type: None, type_length: None, repetition_type: repetition, name: basic_info.name(), diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 44d9058abf11..3ca062990f06 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1299,8 +1299,7 @@ fn schema_from_array_helper<'a>( )); } let repetition = element.repetition_type.unwrap(); - if let Some(type_) = element.type_ { - let physical_type = type_; + if let Some(physical_type) = element.r#type { let length = element.type_length.unwrap_or(-1); let scale = element.scale.unwrap_or(-1); let precision = element.precision.unwrap_or(-1); From e623a565018f2c90e75a67eb7cc86c47dc562060 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 15:26:41 -0700 Subject: [PATCH 114/126] finish merge --- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/geospatial/bounding_box.rs | 17 ------------ parquet/src/geospatial/statistics.rs | 36 +------------------------- 3 files changed, 2 insertions(+), 53 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 7e1db69c473c..8947e717c14d 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1850,7 +1850,7 @@ mod tests { #[cfg(not(feature = "encryption"))] let bigger_expected_size = 2706; #[cfg(feature = "encryption")] - let bigger_expected_size = 3138; + let bigger_expected_size = 3170; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/geospatial/bounding_box.rs b/parquet/src/geospatial/bounding_box.rs index aa6798eb8da2..ce23696afcf3 100644 --- a/parquet/src/geospatial/bounding_box.rs +++ b/parquet/src/geospatial/bounding_box.rs @@ -21,7 +21,6 @@ //! Derived from the parquet format spec: //! //! -use crate::format as parquet; /// A geospatial instance has at least two coordinate dimensions: X and Y for 2D coordinates of each point. /// X represents longitude/easting and Y represents latitude/northing. A geospatial instance can optionally @@ -171,22 +170,6 @@ impl BoundingBox { } } -impl From for parquet::BoundingBox { - /// Converts our internal `BoundingBox` to the Thrift-generated format. - fn from(b: BoundingBox) -> parquet::BoundingBox { - parquet::BoundingBox { - xmin: b.x_range.0.into(), - xmax: b.x_range.1.into(), - ymin: b.y_range.0.into(), - ymax: b.y_range.1.into(), - zmin: b.z_range.map(|z| z.0.into()), - zmax: b.z_range.map(|z| z.1.into()), - mmin: b.m_range.map(|m| m.0.into()), - mmax: b.m_range.map(|m| m.1.into()), - } - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/parquet/src/geospatial/statistics.rs b/parquet/src/geospatial/statistics.rs index 6d7cd030f433..2e99d9c62aff 100644 --- a/parquet/src/geospatial/statistics.rs +++ b/parquet/src/geospatial/statistics.rs @@ -20,7 +20,6 @@ //! This module provides functionality for working with geospatial statistics in Parquet files. //! It includes support for bounding boxes and geospatial statistics in column chunk metadata. 
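The `From` conversion into the generated `format::BoundingBox` goes away above, but the builder-style constructors on `BoundingBox` itself survive. A minimal usage sketch, assuming the `(xmin, xmax, ymin, ymax)` argument order exercised by the removed conversion tests:

```rust
use parquet::geospatial::bounding_box::BoundingBox;

fn main() {
    // X and Y ranges are required; Z and M ranges are optional and added builder-style.
    let bbox = BoundingBox::new(-10.0, 10.0, -20.0, 20.0)
        .with_zrange(5.0, 15.0)
        .with_mrange(10.0, 20.0);
    let _ = bbox;
}
```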
-use crate::format::GeospatialStatistics as TGeospatialStatistics; use crate::geospatial::bounding_box::BoundingBox; // ---------------------------------------------------------------------- @@ -70,44 +69,11 @@ impl GeospatialStatistics { } } -/// Converts our internal geospatial statistics to the Thrift-generated format. -pub fn to_thrift(geo_statistics: Option<&GeospatialStatistics>) -> Option { - let geo_stats = geo_statistics?; - let bbox = geo_stats.bbox.clone().map(|bbox| bbox.into()); - let geospatial_types = geo_stats.geospatial_types.clone(); - Some(TGeospatialStatistics::new(bbox, geospatial_types)) -} - #[cfg(test)] mod tests { use super::*; - #[test] - fn test_bbox_to_thrift() { - use crate::format as parquet; - use thrift::OrderedFloat; - - let bbox = BoundingBox::new(0.0, 0.0, 100.0, 100.0); - let thrift_bbox: parquet::BoundingBox = bbox.into(); - assert_eq!(thrift_bbox.xmin, 0.0); - assert_eq!(thrift_bbox.xmax, 0.0); - assert_eq!(thrift_bbox.ymin, 100.0); - assert_eq!(thrift_bbox.ymax, 100.0); - assert_eq!(thrift_bbox.zmin, None); - assert_eq!(thrift_bbox.zmax, None); - assert_eq!(thrift_bbox.mmin, None); - assert_eq!(thrift_bbox.mmax, None); - - let bbox_z = BoundingBox::new(0.0, 0.0, 100.0, 100.0).with_zrange(5.0, 15.0); - let thrift_bbox_z: parquet::BoundingBox = bbox_z.into(); - assert_eq!(thrift_bbox_z.zmin, Some(OrderedFloat(5.0))); - assert_eq!(thrift_bbox_z.zmax, Some(OrderedFloat(15.0))); - - let bbox_m = BoundingBox::new(0.0, 0.0, 100.0, 100.0).with_mrange(10.0, 20.0); - let thrift_bbox_m: parquet::BoundingBox = bbox_m.into(); - assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0))); - assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0))); - } + // TODO(ets): add round trip to/from parquet tests #[test] fn test_read_geospatial_statistics_from_file() { From f6be170bc5432ffcb6ff491c22e6228cf4035cdd Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 15:32:48 -0700 Subject: [PATCH 115/126] fix test --- parquet/src/file/metadata/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 8947e717c14d..abf0542c969a 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1848,7 +1848,7 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2706; + let bigger_expected_size = 2738; #[cfg(feature = "encryption")] let bigger_expected_size = 3170; From 8454d5083832d83688ec49da21667f578a28cd19 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Sep 2025 16:05:31 -0700 Subject: [PATCH 116/126] some doc changes from review comments --- parquet/THRIFT.md | 5 +++-- parquet/src/file/page_index/column_index.rs | 7 ++++++- parquet/src/file/page_index/offset_index.rs | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/parquet/THRIFT.md b/parquet/THRIFT.md index 1079cd8e6964..06e97709cce3 100644 --- a/parquet/THRIFT.md +++ b/parquet/THRIFT.md @@ -31,8 +31,9 @@ Thrift macros, as well as how to implement custom encoders and decoders. The Parquet specification utilizes Thrift enums, unions, and structs, defined by an Interface Description Language (IDL). This IDL is usually parsed by a Thrift code generator to produce language specific structures and serialization/deserialization code. This crate, however, uses -Rust macros to perform the same function. This allows for customizations that produce more -performant code, as well as the ability to pick and choose which fields to process. 
+Rust macros to perform the same function. In addition to avoiding the creation of duplicate
+structures, doing so allows for customizations that produce more performant code, as well as the
+ability to pick and choose which fields to process.

 ### Enums

diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs
index 6b5e9eb946a1..2aa155a2825d 100644
--- a/parquet/src/file/page_index/column_index.rs
+++ b/parquet/src/file/page_index/column_index.rs
@@ -499,7 +499,12 @@ macro_rules! colidx_enum_func {
     }};
 }

-/// index
+/// Parsed [`ColumnIndex`] information for a Parquet file.
+///
+/// See [`ParquetColumnIndex`] for more information.
+///
+/// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
+/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 #[derive(Debug, Clone, PartialEq)]
 #[allow(non_camel_case_types)]
 pub enum ColumnIndexMetaData {
diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs
index 6f04f39c98d5..d79da37824c8 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -48,6 +48,9 @@ thrift_struct!(
 /// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page
 /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns.
 ///
+/// See [`ParquetOffsetIndex`] for more information.
+///
+/// [`ParquetOffsetIndex`]: crate::file::metadata::ParquetOffsetIndex
 /// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub struct OffsetIndexMetaData {
     /// Vector of [`PageLocation`] objects, one per page in the chunk.

From cba5d3d3f5247f187932e3cbb7f7ea1c1629fe3c Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 30 Sep 2025 12:21:16 -0700
Subject: [PATCH 117/126] rename more type_ fields as r#type

---
 parquet/src/column/page.rs              | 4 ++--
 parquet/src/column/page_encryption.rs   | 4 ++--
 parquet/src/file/metadata/thrift_gen.rs | 8 ++++----
 parquet/src/file/serialized_reader.rs   | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index 23bf4548fbb4..09125eaabf02 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -218,7 +218,7 @@ impl CompressedPage {
         let page_type = self.page_type();

         let mut page_header = PageHeader {
-            type_: page_type,
+            r#type: page_type,
             uncompressed_page_size: uncompressed_size as i32,
             compressed_page_size: compressed_size as i32,
             // TODO: Add support for crc checksum
@@ -351,7 +351,7 @@ impl TryFrom<&crate::file::metadata::thrift_gen::PageHeader> for PageMetadata {
     fn try_from(
         value: &crate::file::metadata::thrift_gen::PageHeader,
     ) -> std::result::Result<Self, Self::Error> {
-        match value.type_ {
+        match value.r#type {
             PageType::DATA_PAGE => {
                 let header = value.data_page_header.as_ref().unwrap();
                 Ok(PageMetadata {
diff --git a/parquet/src/column/page_encryption.rs b/parquet/src/column/page_encryption.rs
index 7ee367a289c8..2486c2c289c4 100644
--- a/parquet/src/column/page_encryption.rs
+++ b/parquet/src/column/page_encryption.rs
@@ -95,14 +95,14 @@ impl PageEncryptor {
         page_header: &PageHeader,
         sink: &mut W,
     ) -> Result<()> {
-        let module_type = match page_header.type_ {
+        let module_type = match page_header.r#type {
             PageType::DATA_PAGE => ModuleType::DataPageHeader,
             PageType::DATA_PAGE_V2 => ModuleType::DataPageHeader,
             PageType::DICTIONARY_PAGE => ModuleType::DictionaryPageHeader,
             _ => {
                 return
Err(general_err!( "Unsupported page type for page header encryption: {:?}", - page_header.type_ + page_header.r#type )) } }; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 921a212a93a3..632c9b007617 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -177,7 +177,7 @@ struct ColumnChunk<'a> { type CompressionCodec = Compression; thrift_struct!( struct ColumnMetaData<'a> { - 1: required Type type_ + 1: required Type r#type 2: required list encodings // we don't expose path_in_schema so skip //3: required list path_in_schema @@ -306,7 +306,7 @@ fn convert_column( return Err(general_err!("Expected to have column metadata")); } let col_metadata = column.meta_data.unwrap(); - let column_type = col_metadata.type_; + let column_type = col_metadata.r#type; let encodings = col_metadata.encodings; let compression = col_metadata.codec; let file_path = column.file_path.map(|v| v.to_owned()); @@ -1095,7 +1095,7 @@ impl DataPageHeaderV2 { thrift_struct!( pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set - 1: required PageType type_ + 1: required PageType r#type /// Uncompressed page size in bytes (not including this header) 2: required i32 uncompressed_page_size @@ -1191,7 +1191,7 @@ impl PageHeader { )); }; Ok(Self { - type_, + r#type: type_, uncompressed_page_size, compressed_page_size, crc, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index b0d64ea76017..c47c118e43bb 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -413,7 +413,7 @@ pub(crate) fn decode_page( _ => buffer, }; - let result = match page_header.type_ { + let result = match page_header.r#type { PageType::DICTIONARY_PAGE => { let dict_header = page_header.dictionary_page_header.as_ref().ok_or_else(|| { ParquetError::General("Missing dictionary page header".to_string()) @@ -458,7 +458,7 @@ pub(crate) fn decode_page( } _ => { // For unknown page type (e.g., INDEX_PAGE), skip and read next. - unimplemented!("Page type {:?} is not supported", page_header.type_) + unimplemented!("Page type {:?} is not supported", page_header.r#type) } }; @@ -894,7 +894,7 @@ impl PageReader for SerializedPageReader { *offset += data_len as u64; *remaining -= data_len as u64; - if header.type_ == PageType::INDEX_PAGE { + if header.r#type == PageType::INDEX_PAGE { continue; } From e58c9552783ac2a3a6f0003a2ce0d2fbc3c6e0b2 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 30 Sep 2025 13:09:25 -0700 Subject: [PATCH 118/126] clean up parquet_thrift --- parquet/src/parquet_thrift.rs | 36 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index eb66d5376e40..e27c7d16efdb 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -15,10 +15,16 @@ // specific language governing permissions and limitations // under the License. -//! experimental replacement for thrift decoder -// this is a copy of TCompactSliceInputProtocol, but modified -// to not allocate byte arrays or strings. -#![allow(dead_code)] +//! Structs used for encoding and decoding Parquet Thrift objects. +//! +//! These include: +//! * [`ThriftCompactInputProtocol`]: Trait implemented by Thrift decoders. +//! * [`ThriftSliceInputProtocol`]: Thrift decoder that takes a slice of bytes as input. +//! 
* [`ThriftReadInputProtocol`]: Thrift decoder that takes a [`Read`] as input. +//! * [`ReadThrift`]: Trait implemented by serializable objects. +//! * [`ThriftCompactOutputProtocol`]: Thrift encoder. +//! * [`WriteThrift`]: Trait implemented by serializable objects. +//! * [`WriteThriftField`]: Trait implemented by serializable objects that are fields in Thrift structs. use std::{ cmp::Ordering, @@ -438,11 +444,6 @@ impl<'a> ThriftSliceInputProtocol<'a> { Self { buf } } - /// Re-initialize this reader with a new slice. - pub fn reset_buffer(&mut self, buf: &'a [u8]) { - self.buf = buf; - } - /// Return the current buffer as a slice. pub fn as_slice(&self) -> &'a [u8] { self.buf @@ -638,11 +639,6 @@ impl ThriftCompactOutputProtocol { Self { writer } } - /// Return a reference to the underlying `Write`. - pub(crate) fn inner(&self) -> &W { - &self.writer - } - /// Write a single byte to the output stream. fn write_byte(&mut self, b: u8) -> Result<()> { self.writer.write_all(&[b])?; @@ -1077,13 +1073,13 @@ pub(crate) mod tests { where T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift + PartialEq + Debug, { - let buf = Vec::::new(); - let mut writer = ThriftCompactOutputProtocol::new(buf); - val.write_thrift(&mut writer).unwrap(); - - //println!("serialized: {:x?}", writer.inner()); + let mut buf = Vec::::new(); + { + let mut writer = ThriftCompactOutputProtocol::new(&mut buf); + val.write_thrift(&mut writer).unwrap(); + } - let mut prot = ThriftSliceInputProtocol::new(writer.inner()); + let mut prot = ThriftSliceInputProtocol::new(&buf); let read_val = T::read_thrift(&mut prot).unwrap(); assert_eq!(val, read_val); } From e6d80f792f4f7f4cc9161b2fbd8ad7c892dc56c1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 30 Sep 2025 15:44:04 -0700 Subject: [PATCH 119/126] make file_path match with/without encryption --- parquet/src/file/metadata/thrift_gen.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 632c9b007617..6968d96cecc7 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -164,7 +164,7 @@ struct ColumnChunk<'a> { #[cfg(not(feature = "encryption"))] thrift_struct!( struct ColumnChunk<'a> { - 1: optional string file_path + 1: optional string<'a> file_path 2: required i64 file_offset = 0 3: optional ColumnMetaData<'a> meta_data 4: optional i64 offset_index_offset From ef5ef6d66056a797c4c8de06c0e010fe0926863a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 30 Sep 2025 15:55:15 -0700 Subject: [PATCH 120/126] clean up some docs --- parquet/src/file/metadata/thrift_gen.rs | 34 ++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 6968d96cecc7..0e9d0e6b13fb 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -54,16 +54,41 @@ use crate::{ // this needs to be visible to the schema conversion code thrift_struct!( pub(crate) struct SchemaElement<'a> { - /** Data type for this field. Not set if the current element is a non-leaf node */ + /// Data type for this field. Not set if the current element is a non-leaf node 1: optional Type r#type; + /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. + /// Otherwise, if specified, this is the maximum bit length to store any of the values. + /// (e.g. 
a low cardinality INT col could have this set to 3). Note that this is + /// in the schema, and therefore fixed for the entire file. 2: optional i32 type_length; + /// Repetition of the field. The root of the schema does not have a repetition_type. + /// All other nodes must have one. 3: optional Repetition repetition_type; + /// Name of the field in the schema 4: required string<'a> name; + /// Nested fields. Since thrift does not support nested fields, + /// the nesting is flattened to a single list by a depth-first traversal. + /// The children count is used to construct the nested relationship. + /// This field is not set when the element is a primitive type. 5: optional i32 num_children; + /// DEPRECATED: When the schema is the result of a conversion from another model. + /// Used to record the original type to help with cross conversion. + /// + /// This is superseded by logical_type. 6: optional ConvertedType converted_type; + /// DEPRECATED: Used when this column contains decimal data. + /// See the DECIMAL converted type for more details. + /// + /// This is superseded by using the DecimalType annotation in logical_type. 7: optional i32 scale 8: optional i32 precision + /// When the original schema supports field ids, this will save the + /// original field id in the parquet schema 9: optional i32 field_id; + /// The logical type of this SchemaElement + /// + /// LogicalType replaces ConvertedType, but ConvertedType is still required + /// for some logical types to ensure forward-compatibility in format v1. 10: optional LogicalType logical_type } ); @@ -112,8 +137,8 @@ pub(crate) struct FileCryptoMetaData { /// inside footer (FileMetaData structure). 1: required EncryptionAlgorithm encryption_algorithm - /** Retrieval metadata of key used for encryption of footer, - * and (possibly) columns **/ + /// Retrieval metadata of key used for encryption of footer, + /// and (possibly) columns. 2: optional binary key_metadata } ); @@ -121,7 +146,6 @@ pub(crate) struct FileCryptoMetaData { // the following are only used internally so are private thrift_struct!( struct FileMetaData<'a> { - /** Version of this file **/ 1: required i32 version 2: required list<'a> schema; 3: required i64 num_rows @@ -214,9 +238,7 @@ struct BoundingBox { thrift_struct!( struct GeospatialStatistics { - /** A bounding box of geospatial instances */ 1: optional BoundingBox bbox; - /** Geospatial type codes of all instances, or an empty list if not known */ 2: optional list geospatial_types; } ); From 0ba2bcb594b38043e073dd302e05abd5f490b5c2 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 30 Sep 2025 16:27:14 -0700 Subject: [PATCH 121/126] refactor parser to cluster more encryption specific code --- parquet/src/file/metadata/parser.rs | 199 +++++++++++------------- parquet/src/file/metadata/thrift_gen.rs | 9 ++ 2 files changed, 97 insertions(+), 111 deletions(-) diff --git a/parquet/src/file/metadata/parser.rs b/parquet/src/file/metadata/parser.rs index ccfdcaffc69f..cbe005d8f96a 100644 --- a/parquet/src/file/metadata/parser.rs +++ b/parquet/src/file/metadata/parser.rs @@ -29,16 +29,13 @@ use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; use bytes::Bytes; -#[cfg(feature = "encryption")] -use crate::encryption::decrypt::FileDecryptionProperties; - /// Helper struct for metadata parsing /// /// This structure parses thrift-encoded bytes into the correct Rust structs, /// such as [`ParquetMetaData`], handling decryption if necessary. 
// // Note this structure is used to minimize the number of -// places need to add `#[cfg(feature = "encryption")]` checks. +// places to add `#[cfg(feature = "encryption")]` checks. pub(crate) use inner::MetadataParser; #[cfg(feature = "encryption")] @@ -74,13 +71,69 @@ mod inner { buf: &[u8], encrypted_footer: bool, ) -> Result { - decode_metadata_with_encryption( - buf, - encrypted_footer, + crate::file::metadata::thrift_gen::parquet_metadata_with_encryption( self.file_decryption_properties.as_deref(), + encrypted_footer, + buf, ) } } + + pub(super) fn parse_single_column_index( + bytes: &[u8], + metadata: &ParquetMetaData, + column: &ColumnChunkMetaData, + row_group_index: usize, + col_index: usize, + ) -> crate::errors::Result { + use crate::encryption::decrypt::CryptoContext; + match &column.column_crypto_metadata { + Some(crypto_metadata) => { + let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { + general_err!("Cannot decrypt column index, no file decryptor set") + })?; + let crypto_context = CryptoContext::for_column( + file_decryptor, + crypto_metadata, + row_group_index, + col_index, + )?; + let column_decryptor = crypto_context.metadata_decryptor(); + let aad = crypto_context.create_column_index_aad()?; + let plaintext = column_decryptor.decrypt(bytes, &aad)?; + decode_column_index(&plaintext, column.column_type()) + } + None => decode_column_index(bytes, column.column_type()), + } + } + + pub(super) fn parse_single_offset_index( + bytes: &[u8], + metadata: &ParquetMetaData, + column: &ColumnChunkMetaData, + row_group_index: usize, + col_index: usize, + ) -> crate::errors::Result { + use crate::encryption::decrypt::CryptoContext; + match &column.column_crypto_metadata { + Some(crypto_metadata) => { + let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { + general_err!("Cannot decrypt offset index, no file decryptor set") + })?; + let crypto_context = CryptoContext::for_column( + file_decryptor, + crypto_metadata, + row_group_index, + col_index, + )?; + let column_decryptor = crypto_context.metadata_decryptor(); + let aad = crypto_context.create_offset_index_aad()?; + let plaintext = column_decryptor.decrypt(bytes, &aad)?; + decode_offset_index(&plaintext) + } + None => decode_offset_index(bytes), + } + } } #[cfg(not(feature = "encryption"))] @@ -112,6 +165,26 @@ mod inner { } } } + + pub(super) fn parse_single_column_index( + bytes: &[u8], + _metadata: &ParquetMetaData, + column: &ColumnChunkMetaData, + _row_group_index: usize, + _col_index: usize, + ) -> crate::errors::Result { + decode_column_index(bytes, column.column_type()) + } + + pub(super) fn parse_single_offset_index( + bytes: &[u8], + _metadata: &ParquetMetaData, + _column: &ColumnChunkMetaData, + _row_group_index: usize, + _col_index: usize, + ) -> crate::errors::Result { + decode_offset_index(bytes) + } } /// Decodes [`ParquetMetaData`] from the provided bytes. 
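The `inner` module above exists so that call sites never repeat `#[cfg(feature = "encryption")]`: both builds export the same function names with the same signatures, and only the bodies differ. A standalone sketch of the pattern (the names below are illustrative, not part of the parquet API):

```rust
mod inner {
    // Feature-gated body: decrypt before decoding (decryption elided in this sketch).
    #[cfg(feature = "encryption")]
    pub(super) fn parse(bytes: &[u8]) -> Result<Vec<u8>, String> {
        Ok(bytes.to_vec())
    }

    // Plain body: identical signature, no decryption step.
    #[cfg(not(feature = "encryption"))]
    pub(super) fn parse(bytes: &[u8]) -> Result<Vec<u8>, String> {
        Ok(bytes.to_vec())
    }
}

// Callers stay free of cfg checks; only `inner` changes per feature.
fn parse_index(bytes: &[u8]) -> Result<Vec<u8>, String> {
    inner::parse(bytes)
}

fn main() {
    assert_eq!(parse_index(b"abc").unwrap(), b"abc".to_vec());
}
```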
@@ -155,7 +228,7 @@ pub(crate) fn parse_column_index( Some(r) => { let r_start = usize::try_from(r.start - start_offset)?; let r_end = usize::try_from(r.end - start_offset)?; - parse_single_column_index( + inner::parse_single_column_index( &bytes[r_start..r_end], metadata, c, @@ -173,46 +246,6 @@ pub(crate) fn parse_column_index( Ok(()) } -#[cfg(feature = "encryption")] -fn parse_single_column_index( - bytes: &[u8], - metadata: &ParquetMetaData, - column: &ColumnChunkMetaData, - row_group_index: usize, - col_index: usize, -) -> crate::errors::Result { - use crate::encryption::decrypt::CryptoContext; - match &column.column_crypto_metadata { - Some(crypto_metadata) => { - let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { - general_err!("Cannot decrypt column index, no file decryptor set") - })?; - let crypto_context = CryptoContext::for_column( - file_decryptor, - crypto_metadata, - row_group_index, - col_index, - )?; - let column_decryptor = crypto_context.metadata_decryptor(); - let aad = crypto_context.create_column_index_aad()?; - let plaintext = column_decryptor.decrypt(bytes, &aad)?; - decode_column_index(&plaintext, column.column_type()) - } - None => decode_column_index(bytes, column.column_type()), - } -} - -#[cfg(not(feature = "encryption"))] -fn parse_single_column_index( - bytes: &[u8], - _metadata: &ParquetMetaData, - column: &ColumnChunkMetaData, - _row_group_index: usize, - _col_index: usize, -) -> crate::errors::Result { - decode_column_index(bytes, column.column_type()) -} - pub(crate) fn parse_offset_index( metadata: &mut ParquetMetaData, offset_index_policy: PageIndexPolicy, @@ -231,7 +264,13 @@ pub(crate) fn parse_offset_index( Some(r) => { let r_start = usize::try_from(r.start - start_offset)?; let r_end = usize::try_from(r.end - start_offset)?; - parse_single_offset_index(&bytes[r_start..r_end], metadata, c, rg_idx, col_idx) + inner::parse_single_offset_index( + &bytes[r_start..r_end], + metadata, + c, + rg_idx, + col_idx, + ) } None => Err(general_err!("missing offset index")), }; @@ -255,65 +294,3 @@ pub(crate) fn parse_offset_index( metadata.set_offset_index(Some(all_indexes)); Ok(()) } - -#[cfg(feature = "encryption")] -fn parse_single_offset_index( - bytes: &[u8], - metadata: &ParquetMetaData, - column: &ColumnChunkMetaData, - row_group_index: usize, - col_index: usize, -) -> crate::errors::Result { - use crate::encryption::decrypt::CryptoContext; - match &column.column_crypto_metadata { - Some(crypto_metadata) => { - let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { - general_err!("Cannot decrypt offset index, no file decryptor set") - })?; - let crypto_context = CryptoContext::for_column( - file_decryptor, - crypto_metadata, - row_group_index, - col_index, - )?; - let column_decryptor = crypto_context.metadata_decryptor(); - let aad = crypto_context.create_offset_index_aad()?; - let plaintext = column_decryptor.decrypt(bytes, &aad)?; - decode_offset_index(&plaintext) - } - None => decode_offset_index(bytes), - } -} - -#[cfg(not(feature = "encryption"))] -fn parse_single_offset_index( - bytes: &[u8], - _metadata: &ParquetMetaData, - _column: &ColumnChunkMetaData, - _row_group_index: usize, - _col_index: usize, -) -> crate::errors::Result { - decode_offset_index(bytes) -} - -/// Decodes [`ParquetMetaData`] from the provided bytes, handling metadata that may be encrypted. -/// -/// Typically this is used to decode the metadata from the end of a parquet -/// file. 
The format of `buf` is the Thrift compact binary protocol, as specified
-/// by the [Parquet Spec]. Buffer can be encrypted with AES GCM or AES CTR
-/// ciphers as specfied in the [Parquet Encryption Spec].
-///
-/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
-/// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/
-#[cfg(feature = "encryption")]
-fn decode_metadata_with_encryption(
-    buf: &[u8],
-    encrypted_footer: bool,
-    file_decryption_properties: Option<&FileDecryptionProperties>,
-) -> crate::errors::Result<ParquetMetaData> {
-    super::thrift_gen::parquet_metadata_with_encryption(
-        file_decryption_properties,
-        encrypted_footer,
-        buf,
-    )
-}
diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs
index 0e9d0e6b13fb..42ca5b1a0324 100644
--- a/parquet/src/file/metadata/thrift_gen.rs
+++ b/parquet/src/file/metadata/thrift_gen.rs
@@ -672,6 +672,15 @@ fn row_group_from_encrypted_thrift(
 }

 #[cfg(feature = "encryption")]
+/// Decodes [`ParquetMetaData`] from the provided bytes, handling metadata that may be encrypted.
+///
+/// Typically this is used to decode the metadata from the end of a parquet
+/// file. The format of `buf` is the Thrift compact binary protocol, as specified
+/// by the [Parquet Spec]. Buffer can be encrypted with AES GCM or AES CTR
+/// ciphers as specified in the [Parquet Encryption Spec].
+///
+/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
+/// [Parquet Encryption Spec]: https://parquet.apache.org/docs/file-format/data-pages/encryption/
 pub(crate) fn parquet_metadata_with_encryption(
     file_decryption_properties: Option<&FileDecryptionProperties>,
     encrypted_footer: bool,

From 70efc43a3f12051fa5e18ac0eb65f049967fecae Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 30 Sep 2025 19:26:30 -0700
Subject: [PATCH 122/126] remove a few allocations

---
 parquet/src/file/metadata/thrift_gen.rs | 22 +++++++++++-----------
 parquet/src/file/metadata/writer.rs     |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs
index 42ca5b1a0324..7a0b32bfe12e 100644
--- a/parquet/src/file/metadata/thrift_gen.rs
+++ b/parquet/src/file/metadata/thrift_gen.rs
@@ -131,7 +131,7 @@ union EncryptionAlgorithm {
 #[cfg(feature = "encryption")]
 thrift_struct!(
 /// Crypto metadata for files with encrypted footer
-pub(crate) struct FileCryptoMetaData {
+pub(crate) struct FileCryptoMetaData<'a> {
     /// Encryption algorithm. This field is only used for files
     /// with encrypted footer. Files with plaintext footer store algorithm id
     /// inside footer (FileMetaData structure).
     1: required EncryptionAlgorithm encryption_algorithm

     /// Retrieval metadata of key used for encryption of footer,
     /// and (possibly) columns.
- 2: optional binary key_metadata + 2: optional binary<'a> key_metadata } ); @@ -151,10 +151,10 @@ struct FileMetaData<'a> { 3: required i64 num_rows 4: required list<'a> row_groups 5: optional list key_value_metadata - 6: optional string created_by + 6: optional string<'a> created_by 7: optional list column_orders; 8: optional EncryptionAlgorithm encryption_algorithm - 9: optional binary footer_signing_key_metadata + 9: optional binary<'a> footer_signing_key_metadata } ); @@ -708,7 +708,7 @@ pub(crate) fn parquet_metadata_with_encryption( } let decryptor = get_file_decryptor( t_file_crypto_metadata.encryption_algorithm, - t_file_crypto_metadata.key_metadata.as_ref(), + t_file_crypto_metadata.key_metadata, file_decryption_properties, )?; let footer_decryptor = decryptor.get_footer_decryptor(); @@ -731,7 +731,7 @@ pub(crate) fn parquet_metadata_with_encryption( } } - let file_meta = super::thrift_gen::FileMetaData::read_thrift(&mut prot) + let file_meta = FileMetaData::read_thrift(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let version = file_meta.version; @@ -748,7 +748,7 @@ pub(crate) fn parquet_metadata_with_encryption( // File has a plaintext footer but encryption algorithm is set let file_decryptor_value = get_file_decryptor( algo, - file_meta.footer_signing_key_metadata.as_ref(), + file_meta.footer_signing_key_metadata, file_decryption_properties, )?; if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { @@ -807,9 +807,9 @@ pub(crate) fn parquet_metadata_with_encryption( } #[cfg(feature = "encryption")] -pub(super) fn get_file_decryptor( +fn get_file_decryptor( encryption_algorithm: EncryptionAlgorithm, - footer_key_metadata: Option<&Vec>, + footer_key_metadata: Option<&[u8]>, file_decryption_properties: &FileDecryptionProperties, ) -> Result { match encryption_algorithm { @@ -826,7 +826,7 @@ pub(super) fn get_file_decryptor( FileDecryptor::new( file_decryption_properties, - footer_key_metadata.map(|v| v.as_slice()), + footer_key_metadata, aad_file_unique, aad_prefix, ) @@ -841,7 +841,7 @@ pub(super) fn get_file_decryptor( /// the Parquet footer. Page indexes will need to be added later. 
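Outside the crate, the plaintext path that this impl backs remains reachable through `ParquetMetaDataReader::decode_metadata`, the same entry point the benchmarks above call. A minimal sketch that mirrors the footer slicing done by `get_footer_bytes` in the benchmarks (`example.parquet` is a placeholder for any unencrypted file):

```rust
use bytes::Bytes;
use parquet::file::metadata::ParquetMetaDataReader;
use parquet::file::FOOTER_SIZE;

fn main() {
    let data: Bytes = std::fs::read("example.parquet").unwrap().into();

    // The last 8 bytes hold a little-endian metadata length followed by the
    // "PAR1" magic; the thrift-encoded metadata sits immediately before them.
    let footer_start = data.len() - FOOTER_SIZE;
    let len_bytes: [u8; 4] = data[footer_start..footer_start + 4].try_into().unwrap();
    let meta_len = u32::from_le_bytes(len_bytes) as usize;
    let meta = data.slice(footer_start - meta_len..footer_start);

    let metadata = ParquetMetaDataReader::decode_metadata(&meta).unwrap();
    println!("row groups: {}", metadata.num_row_groups());
}
```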
impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaData {
     fn read_thrift(prot: &mut R) -> Result<Self> {
-        let file_meta = super::thrift_gen::FileMetaData::read_thrift(prot)?;
+        let file_meta = FileMetaData::read_thrift(prot)?;
 
         let version = file_meta.version;
         let num_rows = file_meta.num_rows;
diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs
index 593ac95c78bb..97d008e17308 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -711,11 +711,11 @@ impl MetadataObjectWriter {
         })
     }
 
-    fn file_crypto_metadata(file_encryptor: &FileEncryptor) -> Result<FileCryptoMetaData> {
+    fn file_crypto_metadata(file_encryptor: &'_ FileEncryptor) -> Result<FileCryptoMetaData<'_>> {
         let properties = file_encryptor.properties();
         Ok(FileCryptoMetaData {
             encryption_algorithm: Self::encryption_algorithm_from_encryptor(file_encryptor),
-            key_metadata: properties.footer_key_metadata().cloned(),
+            key_metadata: properties.footer_key_metadata().map(|v| v.as_slice()),
         })
     }
 
From bb5b6888bea7a08aefa634ac77feee32c33a6534 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 1 Oct 2025 06:56:21 -0700
Subject: [PATCH 123/126] remove TODO

---
 parquet/src/file/metadata/mod.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index abf0542c969a..b7e99e67b632 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -555,7 +555,6 @@ impl FileMetaData {
     }
 }
 
-// TODO: should this move to thrift_gen?
 thrift_struct!(
 /// Sort order within a RowGroup of a leaf column
 pub struct SortingColumn {

From 6febae06c04ac215ade8978ce00dc043d3e2463b Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 1 Oct 2025 09:49:46 -0700
Subject: [PATCH 124/126] merge in changes to geo spatial stats

---
 parquet/src/basic.rs                    |  49 ++++++++--
 parquet/src/file/metadata/thrift_gen.rs |   2 +-
 parquet/src/geospatial/statistics.rs    |   6 +-
 parquet/src/schema/printer.rs           |  73 +++++++++++++-
 parquet/tests/geospatial.rs             | 123 ++++++++++++++++++++++++
 5 files changed, 239 insertions(+), 14 deletions(-)
 create mode 100644 parquet/tests/geospatial.rs

diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 350f2b6de1e2..3eadbb3307c1 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -349,7 +349,8 @@ pub enum LogicalType {
     },
     /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
     Geometry {
-        /// A custom CRS. If unset the defaults to `OGC:CRS84`.
+        /// A custom CRS. If unset, this defaults to `OGC:CRS84`, which means that the geometries
+        /// must be stored in longitude, latitude based on the WGS84 datum.
         crs: Option<String>,
     },
     /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
     Geography {
         /// A custom CRS. If unset the defaults to `OGC:CRS84`.
         crs: Option<String>,
         /// An optional algorithm can be set to correctly interpret edges interpolation
-        /// of the geometries. If unset, the algorithm defaults to `SPHERICAL``.
+        /// of the geometries. If unset, the algorithm defaults to `SPHERICAL`.
         algorithm: Option<EdgeInterpolationAlgorithm>,
     },
     /// For forward compatibility; used when an unknown union value is encountered.
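Note on the defaults documented in the hunk above: a consumer has to apply them itself,
since a missing `crs` means `OGC:CRS84` and, for `Geography`, a missing `algorithm` means
`SPHERICAL`. A minimal sketch of that normalization (the helper name and shape are
illustrative only, not part of this patch or the crate's API):

    use parquet::basic::{EdgeInterpolationAlgorithm, LogicalType};

    /// Hypothetical helper: resolve the spec defaults for the geospatial logical types.
    fn effective_geo_settings(
        t: &LogicalType,
    ) -> Option<(String, Option<EdgeInterpolationAlgorithm>)> {
        match t {
            // Missing crs defaults to OGC:CRS84 (longitude/latitude on the WGS84 datum).
            LogicalType::Geometry { crs } => Some((
                crs.clone().unwrap_or_else(|| "OGC:CRS84".to_string()),
                None,
            )),
            // Missing algorithm defaults to SPHERICAL, per the Parquet spec.
            LogicalType::Geography { crs, algorithm } => Some((
                crs.clone().unwrap_or_else(|| "OGC:CRS84".to_string()),
                Some(algorithm.clone().unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL)),
            )),
            _ => None,
        }
    }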
@@ -456,9 +457,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {
             }
             18 => {
                 let val = GeographyType::read_thrift(&mut *prot)?;
+                let algorithm = val.algorithm.unwrap_or_default();
                 Self::Geography {
                     crs: val.crs.map(|s| s.to_owned()),
-                    algorithm: val.algorithm,
+                    algorithm: Some(algorithm),
                 }
             }
             _ => {
@@ -928,17 +930,31 @@ enum BoundaryOrder {
 // ----------------------------------------------------------------------
 // Mirrors thrift enum `EdgeInterpolationAlgorithm`
 
+// TODO(ets): we need to allow for unknown variants. Either hand code this one, or add a new
+// macro that adds an _Unknown variant.
+
 thrift_enum!(
 /// Edge interpolation algorithm for Geography logical type
 enum EdgeInterpolationAlgorithm {
+  /// Edges are interpolated as geodesics on a sphere.
   SPHERICAL = 0;
+  ///
   VINCENTY = 1;
+  /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
   THOMAS = 2;
+  /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
   ANDOYER = 3;
+  /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
   KARNEY = 4;
 }
 );
 
+impl Default for EdgeInterpolationAlgorithm {
+    fn default() -> Self {
+        Self::SPHERICAL
+    }
+}
+
 // ----------------------------------------------------------------------
 // Mirrors thrift union `BloomFilterAlgorithm`
 
@@ -1359,7 +1375,7 @@ impl str::FromStr for LogicalType {
             "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }),
             "GEOGRAPHY" => Ok(LogicalType::Geography {
                 crs: None,
-                algorithm: None,
+                algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
             }),
             other => Err(general_err!("Invalid parquet logical type {}", other)),
         }
@@ -1816,6 +1832,17 @@ mod tests {
             ConvertedType::from(Some(LogicalType::Float16)),
             ConvertedType::NONE
         );
+        assert_eq!(
+            ConvertedType::from(Some(LogicalType::Geometry { crs: None })),
+            ConvertedType::NONE
+        );
+        assert_eq!(
+            ConvertedType::from(Some(LogicalType::Geography {
+                crs: None,
+                algorithm: Some(EdgeInterpolationAlgorithm::default()),
+            })),
+            ConvertedType::NONE
+        );
         assert_eq!(
             ConvertedType::from(Some(LogicalType::Unknown)),
             ConvertedType::NONE
@@ -1897,11 +1924,11 @@ {
         });
         test_roundtrip(LogicalType::Geography {
             crs: Some("foo".to_owned()),
-            algorithm: None,
+            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
         });
         test_roundtrip(LogicalType::Geography {
             crs: None,
-            algorithm: None,
+            algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
         });
     }
 
@@ -2113,7 +2140,15 @@
         check_sort_order(signed, SortOrder::SIGNED);
 
         // Undefined comparison
-        let undefined = vec![LogicalType::List, LogicalType::Map];
+        let undefined = vec![
+            LogicalType::List,
+            LogicalType::Map,
+            LogicalType::Geometry { crs: None },
+            LogicalType::Geography {
+                crs: None,
+                algorithm: Some(EdgeInterpolationAlgorithm::default()),
+            },
+        ];
         check_sort_order(undefined, SortOrder::UNDEFINED);
     }
 
diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs
index 7a0b32bfe12e..489cb44cd77b 100644
--- a/parquet/src/file/metadata/thrift_gen.rs
+++ b/parquet/src/file/metadata/thrift_gen.rs
@@ -1585,7 +1585,7 @@ impl WriteThrift for crate::geospatial::statistics::GeospatialStatistics {
     fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
         let mut last_field_id = 0i16;
 
-        if let Some(bbox) = self.bbox() {
+        if let Some(bbox) = self.bounding_box() {
             last_field_id = bbox.write_thrift_field(writer, 1, last_field_id)?;
         }
         if let Some(geo_types) = self.geospatial_types() {
diff --git a/parquet/src/geospatial/statistics.rs b/parquet/src/geospatial/statistics.rs
index 2e99d9c62aff..d3287412b143 100644
--- a/parquet/src/geospatial/statistics.rs
+++ b/parquet/src/geospatial/statistics.rs
@@ -58,12 +58,12 @@ impl GeospatialStatistics {
         }
     }
 
-    /// Return the optional `BoundingBox`.
-    pub fn bbox(&self) -> Option<&BoundingBox> {
+    /// Optional bounding box defining the spatial extent, where `None` represents a lack of information.
+    pub fn bounding_box(&self) -> Option<&BoundingBox> {
         self.bbox.as_ref()
     }
 
-    /// Return the optional list of geospatial types.
+    /// Optional list of geometry type identifiers, where `None` represents a lack of information.
     pub fn geospatial_types(&self) -> Option<&Vec<i32>> {
         self.geospatial_types.as_ref()
     }
diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs
index fd28e13d2edf..0cc5df59f329 100644
--- a/parquet/src/schema/printer.rs
+++ b/parquet/src/schema/printer.rs
@@ -329,9 +329,20 @@ fn print_logical_and_converted(
         LogicalType::Variant {
             specification_version,
         } => format!("VARIANT({specification_version:?})"),
-        LogicalType::Geometry { crs } => format!("GEOMETRY({crs:?})"),
+        LogicalType::Geometry { crs } => {
+            if let Some(crs) = crs {
+                format!("GEOMETRY({crs})")
+            } else {
+                "GEOMETRY".to_string()
+            }
+        }
         LogicalType::Geography { crs, algorithm } => {
-            format!("GEOGRAPHY({crs:?},{algorithm:?})")
+            let algorithm = algorithm.unwrap_or_default();
+            if let Some(crs) = crs {
+                format!("GEOGRAPHY({algorithm}, {crs})")
+            } else {
+                format!("GEOGRAPHY({algorithm})")
+            }
         }
         LogicalType::Unknown => "UNKNOWN".to_string(),
         LogicalType::_Unknown { field_id } => format!("_Unknown({field_id})"),
@@ -454,7 +465,7 @@ mod tests {
 
     use std::sync::Arc;
 
-    use crate::basic::{Repetition, Type as PhysicalType};
+    use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType};
     use crate::errors::Result;
     use crate::schema::parser::parse_message_type;
@@ -784,6 +795,62 @@ mod tests {
                 .unwrap(),
                 "REQUIRED BYTE_ARRAY field [42] (STRING);",
             ),
+            (
+                build_primitive_type(
+                    "field",
+                    None,
+                    PhysicalType::BYTE_ARRAY,
+                    Some(LogicalType::Geometry { crs: None }),
+                    ConvertedType::NONE,
+                    Repetition::REQUIRED,
+                )
+                .unwrap(),
+                "REQUIRED BYTE_ARRAY field (GEOMETRY);",
+            ),
+            (
+                build_primitive_type(
+                    "field",
+                    None,
+                    PhysicalType::BYTE_ARRAY,
+                    Some(LogicalType::Geometry {
+                        crs: Some("non-missing CRS".to_string()),
+                    }),
+                    ConvertedType::NONE,
+                    Repetition::REQUIRED,
+                )
+                .unwrap(),
+                "REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));",
+            ),
+            (
+                build_primitive_type(
+                    "field",
+                    None,
+                    PhysicalType::BYTE_ARRAY,
+                    Some(LogicalType::Geography {
+                        crs: None,
+                        algorithm: Some(EdgeInterpolationAlgorithm::default()),
+                    }),
+                    ConvertedType::NONE,
+                    Repetition::REQUIRED,
+                )
+                .unwrap(),
+                "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));",
+            ),
+            (
+                build_primitive_type(
+                    "field",
+                    None,
+                    PhysicalType::BYTE_ARRAY,
+                    Some(LogicalType::Geography {
+                        crs: Some("non-missing CRS".to_string()),
+                        algorithm: Some(EdgeInterpolationAlgorithm::default()),
+                    }),
+                    ConvertedType::NONE,
+                    Repetition::REQUIRED,
+                )
+                .unwrap(),
+                "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));",
+            ),
         ];
 
         types_and_strings.into_iter().for_each(|(field, expected)| {
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
new file mode 100644
index 000000000000..b3de40491b30
--- /dev/null
+++ b/parquet/tests/geospatial.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for Geometry and Geography logical types +use parquet::{ + basic::{EdgeInterpolationAlgorithm, LogicalType}, + file::{ + metadata::ParquetMetaData, + reader::{FileReader, SerializedFileReader}, + }, + geospatial::bounding_box::BoundingBox, +}; +use serde_json::Value; +use std::fs::File; + +fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { + let path = format!( + "{}/geospatial/{geospatial_test_file}", + arrow::util::test_util::parquet_test_data(), + ); + let file = File::open(path).unwrap(); + let reader = SerializedFileReader::try_from(file).unwrap(); + reader.metadata().clone() +} + +#[test] +fn test_read_logical_type() { + // Some crs values are short strings + let expected_logical_type = [ + ("crs-default.parquet", LogicalType::Geometry { crs: None }), + ( + "crs-srid.parquet", + LogicalType::Geometry { + crs: Some("srid:5070".to_string()), + }, + ), + ( + "crs-projjson.parquet", + LogicalType::Geometry { + crs: Some("projjson:projjson_epsg_5070".to_string()), + }, + ), + ( + "crs-geography.parquet", + LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), + }, + ), + ]; + + for (geospatial_file, expected_type) in expected_logical_type { + let metadata = read_metadata(geospatial_file); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + assert_eq!(logical_type, expected_type); + } + + // The crs value may also contain arbitrary values (in this case some JSON + // a bit too lengthy to type out) + let metadata = read_metadata("crs-arbitrary-value.parquet"); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + if let LogicalType::Geometry { crs } = logical_type { + let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap(); + assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070); + } else { + panic!("Expected geometry type but got {logical_type:?}"); + } +} + +#[test] +fn test_read_geospatial_statistics() { + let metadata = read_metadata("geospatial.parquet"); + + // geospatial.parquet schema: + // optional binary field_id=-1 group (String); + // optional binary field_id=-1 wkt (String); + // optional binary field_id=-1 geometry (Geometry(crs=)); + let fields = metadata.file_metadata().schema().get_fields(); + let logical_type = fields[2].get_basic_info().logical_type().unwrap(); + assert_eq!(logical_type, LogicalType::Geometry { crs: None }); + + let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); + assert!(geo_statistics.is_some()); + + let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) + .with_zrange(30.0, 80.0) + .with_mrange(200.0, 1600.0); + let 
expected_geospatial_types = vec![
+        1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004,
+        2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007,
+    ];
+    assert_eq!(
+        geo_statistics.unwrap().geospatial_types(),
+        Some(&expected_geospatial_types)
+    );
+    assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox));
+}

From 793db5bffd46e5b8c3e045f314aaa5b7f43ba932 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 1 Oct 2025 15:14:34 -0700
Subject: [PATCH 125/126] allow for unknown variants and some doc cleanups

make default to SPHERICAL explicit
---
 parquet/src/basic.rs | 92 +++++++++++++++++++++++++++++---------
 1 file changed, 73 insertions(+), 19 deletions(-)

diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 3eadbb3307c1..7e57b12b6b62 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -457,7 +457,11 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {
             }
             18 => {
                 let val = GeographyType::read_thrift(&mut *prot)?;
-                let algorithm = val.algorithm.unwrap_or_default();
+                // unset algorithm means SPHERICAL, per the spec:
+                // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
+                let algorithm = val
+                    .algorithm
+                    .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL);
                 Self::Geography {
                     crs: val.crs.map(|s| s.to_owned()),
                     algorithm: Some(algorithm),
@@ -930,24 +934,74 @@ enum BoundaryOrder {
 // ----------------------------------------------------------------------
 // Mirrors thrift enum `EdgeInterpolationAlgorithm`
 
-// TODO(ets): we need to allow for unknown variants. Either hand code this one, or add a new
-// macro that adds an _Unknown variant.
+// this is hand coded to allow for the _Unknown variant (allows this to be forward compatible)
+
+/// Edge interpolation algorithm for [`LogicalType::Geography`]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[repr(i32)]
+pub enum EdgeInterpolationAlgorithm {
+    /// Edges are interpolated as geodesics on a sphere.
+    SPHERICAL = 0,
+    ///
+    VINCENTY = 1,
+    /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
+    THOMAS = 2,
+    /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
+    ANDOYER = 3,
+    /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
+    KARNEY = 4,
+    /// Unknown algorithm
+    _Unknown(i32),
+}
+
+impl fmt::Display for EdgeInterpolationAlgorithm {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_fmt(format_args!("{0:?}", self))
+    }
+}
+
+impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm {
+    #[allow(deprecated)]
+    fn read_thrift(prot: &mut R) -> Result<Self> {
+        let val = prot.read_i32()?;
+        match val {
+            0 => Ok(Self::SPHERICAL),
+            1 => Ok(Self::VINCENTY),
+            2 => Ok(Self::THOMAS),
+            3 => Ok(Self::ANDOYER),
+            4 => Ok(Self::KARNEY),
+            _ => Ok(Self::_Unknown(val)),
+        }
+    }
+}
+
+impl WriteThrift for EdgeInterpolationAlgorithm {
+    const ELEMENT_TYPE: ElementType = ElementType::I32;
+    fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
+        let val: i32 = match *self {
+            Self::SPHERICAL => 0,
+            Self::VINCENTY => 1,
+            Self::THOMAS => 2,
+            Self::ANDOYER => 3,
+            Self::KARNEY => 4,
+            Self::_Unknown(i) => i,
+        };
+        writer.write_i32(val)
+    }
+}
+
+impl WriteThriftField for EdgeInterpolationAlgorithm {
+    fn write_thrift_field<W: Write>(
+        &self,
+        writer: &mut ThriftCompactOutputProtocol<W>,
+        field_id: i16,
+        last_field_id: i16,
+    ) -> Result<i16> {
+        writer.write_field_begin(FieldType::I32, field_id, last_field_id)?;
+        self.write_thrift(writer)?;
+        Ok(field_id)
+    }
 }
-);
 
 impl Default for EdgeInterpolationAlgorithm {
     fn default() -> Self {
         Self::SPHERICAL
@@ -961,7 +1015,7 @@ impl Default for EdgeInterpolationAlgorithm {
 thrift_union_all_empty!(
 /// The algorithm used in Bloom filter.
 union BloomFilterAlgorithm {
-  /** Block-based Bloom filter. **/
+  /// Block-based Bloom filter.
   1: SplitBlockAlgorithm BLOCK;
 }
 );
@@ -973,7 +1027,7 @@ thrift_union_all_empty!(
 /// The hash function used in Bloom filter. This function takes the hash of a column value
 /// using plain encoding.
 union BloomFilterHash {
-  /** xxHash Strategy. **/
+  /// xxHash Strategy.
   1: XxHash XXHASH;
 }
 );

From c37bce2ce185cbe7b952114497a10413192623fd Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 2 Oct 2025 09:28:04 -0700
Subject: [PATCH 126/126] clean up leftover #allow

---
 parquet/src/basic.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 7e57b12b6b62..68eebaf5080a 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -961,7 +961,6 @@ impl fmt::Display for EdgeInterpolationAlgorithm {
 }
 
 impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm {
-    #[allow(deprecated)]
     fn read_thrift(prot: &mut R) -> Result<Self> {
         let val = prot.read_i32()?;
         match val {
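A closing note on the hand-coded enum above: the `_Unknown(i32)` variant is what makes the
reader forward compatible. A file written with an algorithm id this version does not know
still parses, and the raw id survives a write round trip. A self-contained sketch of the
same pattern (illustrative names, independent of the crate's thrift machinery):

    // Forward-compatible enum decoding: unknown ids are preserved rather than rejected.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    enum Algo {
        Spherical,    // = 0
        Vincenty,     // = 1
        Unknown(i32), // any id this reader does not recognize yet
    }

    impl Algo {
        fn from_i32(v: i32) -> Self {
            match v {
                0 => Algo::Spherical,
                1 => Algo::Vincenty,
                other => Algo::Unknown(other), // preserve instead of erroring
            }
        }

        fn to_i32(self) -> i32 {
            match self {
                Algo::Spherical => 0,
                Algo::Vincenty => 1,
                Algo::Unknown(v) => v, // round-trips the original id
            }
        }
    }

    fn main() {
        // An id from a future format revision still parses and round-trips.
        let future = Algo::from_i32(7);
        assert_eq!(future, Algo::Unknown(7));
        assert_eq!(future.to_i32(), 7);
    }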