Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 59 additions & 14 deletions parquet/src/file/metadata/thrift/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,20 +192,19 @@ fn convert_stats(
use crate::file::statistics::Statistics as FStatistics;
Ok(match thrift_stats {
Some(stats) => {
// Number of nulls recorded, when it is not available, we just mark it as 0.
// TODO this should be `None` if there is no information about NULLS.
// see https://github.com/apache/arrow-rs/pull/6216/files
let null_count = stats.null_count.unwrap_or(0);

if null_count < 0 {
return Err(general_err!(
"Statistics null count is negative {}",
null_count
));
}

// Generic null count.
let null_count = Some(null_count as u64);
let null_count = stats
.null_count
.map(|null_count| {
if null_count < 0 {
return Err(general_err!(
"Statistics null count is negative {}",
Copy link
Contributor Author

@scovich scovich Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation change only (review with whitespace ignored)

(again below)

null_count
));
}
Ok(null_count as u64)
})
.transpose()?;
// Generic distinct count (count of distinct values occurring)
let distinct_count = stats.distinct_count.map(|value| value as u64);
// Whether or not statistics use deprecated min/max fields.
Expand Down Expand Up @@ -1722,6 +1721,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);

#[cfg(test)]
pub(crate) mod tests {
use crate::basic::Type as PhysicalType;
use crate::errors::Result;
use crate::file::metadata::thrift::{BoundingBox, SchemaElement, write_schema};
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, RowGroupMetaData};
Expand All @@ -1730,7 +1730,8 @@ pub(crate) mod tests {
ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, read_thrift_vec,
};
use crate::schema::types::{
ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes, parquet_schema_from_array,
ColumnDescriptor, ColumnPath, SchemaDescriptor, TypePtr, num_nodes,
parquet_schema_from_array,
};
use std::sync::Arc;

Expand Down Expand Up @@ -1828,4 +1829,48 @@ pub(crate) mod tests {
mmax: Some(42.0.into()),
});
}

#[test]
fn test_convert_stats_preserves_missing_null_count() {
let primitive =
crate::schema::types::Type::primitive_type_builder("col", PhysicalType::INT32)
.build()
.unwrap();
let column_descr = Arc::new(ColumnDescriptor::new(
Arc::new(primitive),
0,
0,
ColumnPath::new(vec![]),
));

let none_null_count = super::Statistics {
max: None,
min: None,
null_count: None,
distinct_count: None,
max_value: None,
min_value: None,
is_max_value_exact: None,
is_min_value_exact: None,
};
let decoded_none = super::convert_stats(&column_descr, Some(none_null_count))
.unwrap()
.unwrap();
assert_eq!(decoded_none.null_count_opt(), None);

let zero_null_count = super::Statistics {
max: None,
min: None,
null_count: Some(0),
distinct_count: None,
max_value: None,
min_value: None,
is_max_value_exact: None,
is_min_value_exact: None,
};
let decoded_zero = super::convert_stats(&column_descr, Some(zero_null_count))
.unwrap()
.unwrap();
assert_eq!(decoded_zero.null_count_opt(), Some(0));
}
}
43 changes: 12 additions & 31 deletions parquet/src/file/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,19 +125,18 @@ pub(crate) fn from_thrift_page_stats(
) -> Result<Option<Statistics>> {
Ok(match thrift_stats {
Some(stats) => {
// Number of nulls recorded, when it is not available, we just mark it as 0.
// TODO this should be `None` if there is no information about NULLS.
// see https://github.com/apache/arrow-rs/pull/6216/files
let null_count = stats.null_count.unwrap_or(0);

if null_count < 0 {
return Err(ParquetError::General(format!(
"Statistics null count is negative {null_count}",
)));
}

// Generic null count.
let null_count = Some(null_count as u64);
let null_count = stats
.null_count
.map(|null_count| {
if null_count < 0 {
return Err(ParquetError::General(format!(
"Statistics null count is negative {null_count}",
)));
}
Ok(null_count as u64)
})
.transpose()?;
// Generic distinct count (count of distinct values occurring)
let distinct_count = stats.distinct_count.map(|value| value as u64);
// Whether or not statistics use deprecated min/max fields.
Expand Down Expand Up @@ -430,10 +429,6 @@ impl Statistics {

/// Returns number of null values for the column, if known.
/// Note that this includes all nulls when column is part of the complex type.
///
/// Note this API returns Some(0) even if the null count was not present
/// in the statistics.
/// See <https://github.com/apache/arrow-rs/pull/6216/files>
pub fn null_count_opt(&self) -> Option<u64> {
statistics_enum_func![self, null_count_opt]
}
Expand Down Expand Up @@ -1064,21 +1059,7 @@ mod tests {
let round_tripped = from_thrift_page_stats(Type::BOOLEAN, Some(thrift_stats))
.unwrap()
.unwrap();
// TODO: remove branch when we no longer support assuming null_count==None in the thrift
// means null_count = Some(0)
if null_count.is_none() {
assert_ne!(round_tripped, statistics);
assert!(round_tripped.null_count_opt().is_some());
assert_eq!(round_tripped.null_count_opt(), Some(0));
assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
assert_eq!(
round_tripped.distinct_count_opt(),
statistics.distinct_count_opt()
);
} else {
assert_eq!(round_tripped, statistics);
}
assert_eq!(round_tripped, statistics);
}

fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
Expand Down
Loading