Support writing GeospatialStatistics in Parquet writer #8524
```diff
@@ -229,11 +229,18 @@ impl<W: Write + Send> ArrowWriter<W> {
         options: ArrowWriterOptions,
     ) -> Result<Self> {
         let mut props = options.properties;
-        let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types());
-        if let Some(schema_root) = &options.schema_root {
-            converter = converter.schema_root(schema_root);
-        }
-        let schema = converter.convert(&arrow_schema)?;
+        let schema = if let Some(parquet_schema) = options.schema_descr {
+            parquet_schema.clone()
+        } else {
+            let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types());
+            if let Some(schema_root) = &options.schema_root {
+                converter = converter.schema_root(schema_root);
+            }
+
+            converter.convert(&arrow_schema)?
+        };
 
         if !options.skip_arrow_metadata {
             // add serialized arrow schema
             add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
```
```diff
@@ -458,6 +465,7 @@ pub struct ArrowWriterOptions {
     properties: WriterProperties,
     skip_arrow_metadata: bool,
     schema_root: Option<String>,
+    schema_descr: Option<SchemaDescriptor>,
 }
 
 impl ArrowWriterOptions {
```
```diff
@@ -491,6 +499,14 @@ impl ArrowWriterOptions {
             ..self
         }
     }
 
+    /// Explicitly specify the Parquet schema to be used
+    pub fn with_parquet_schema(self, schema_descr: SchemaDescriptor) -> Self {
+        Self {
+            schema_descr: Some(schema_descr),
+            ..self
+        }
+    }
 }
 
 /// A single column chunk produced by [`ArrowColumnWriter`]
```

Review comments on `with_parquet_schema`:

Contributor:
this is a nice API addition I think

Contributor:
So this API actually ends up being a bit problematic, the reason being that the type inference and coercion machinery are supposed to mirror each other. With this change:
Further, this interferes with removing arrow_cast as a dependency - #9077. I'm not sure what the intention of this API is: why can't the arrays just be cast before being written, and why does this logic need to live within the parquet writer itself?

Contributor:
I think one rationale was to put the appropriate metadata on the field (so the parquet writer knew what logical type to add), but I may be mistaken. I don't fully understand the concerns about type coercion, but at least part of this API is, I think, designed to allow interoperability with other Arrow implementations (i.e. not just reading back arrays that were written in Rust, but writing arrays that other writers will accept).

Member (Author):
I put a note on the other PR as well, but the intention was really just to be able to add the test that I needed to add at the time. I don't have opinions about how this kind of thing should work here in particular, but a schema request across a type boundary (e.g.
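As a usage illustration (not code taken from this PR), the sketch below shows how the new `ArrowWriterOptions::with_parquet_schema` could be combined with `ArrowWriter::try_new_with_options`: when a `SchemaDescriptor` is supplied, the writer uses it as-is instead of deriving one with `ArrowSchemaConverter`. The column name, message-type string, output file, and exact import paths are assumptions made for the example.

```rust
use std::fs::File;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_writer::{ArrowWriter, ArrowWriterOptions};
use parquet::schema::parser::parse_message_type;
use parquet::schema::types::SchemaDescriptor;

fn write_with_explicit_parquet_schema() -> Result<(), Box<dyn std::error::Error>> {
    // Arrow side: a single nullable Int32 column named "x"
    let arrow_schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)]));
    let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
    let batch = RecordBatch::try_new(arrow_schema.clone(), vec![values])?;

    // Parquet side: spell out the schema explicitly instead of letting
    // ArrowSchemaConverter infer it from the Arrow schema
    let message = "message example { optional int32 x; }";
    let parquet_schema = SchemaDescriptor::new(Arc::new(parse_message_type(message)?));

    let options = ArrowWriterOptions::new().with_parquet_schema(parquet_schema);
    let mut writer = ArrowWriter::try_new_with_options(
        File::create("example.parquet")?,
        arrow_schema,
        options,
    )?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}
```

Presumably this is also the mechanism that lets the new geospatial tests request a binary column annotated with a Geometry or Geography logical type, rather than whatever the Arrow-to-Parquet conversion would otherwise infer.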
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,10 @@ use crate::data_type::DataType; | |
| use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; | ||
| use crate::errors::{ParquetError, Result}; | ||
| use crate::file::properties::{EnabledStatistics, WriterProperties}; | ||
| use crate::geospatial::accumulator::{ | ||
| DefaultGeoStatsAccumulatorFactory, GeoStatsAccumulator, GeoStatsAccumulatorFactory, | ||
| }; | ||
| use crate::geospatial::statistics::GeospatialStatistics; | ||
| use crate::schema::types::{ColumnDescPtr, ColumnDescriptor}; | ||
|
|
||
| /// A collection of [`ParquetValueType`] encoded by a [`ColumnValueEncoder`] | ||
|
|
```diff
@@ -121,6 +125,8 @@ pub trait ColumnValueEncoder {
     /// will *not* be tracked by the bloom filter as it is empty since. This should be called once
     /// near the end of encoding.
     fn flush_bloom_filter(&mut self) -> Option<Sbbf>;
+
+    fn flush_geospatial_statistics(&mut self) -> Option<Box<GeospatialStatistics>>;
 }
 
 pub struct ColumnValueEncoderImpl<T: DataType> {
```
```diff
@@ -133,6 +139,7 @@ pub struct ColumnValueEncoderImpl<T: DataType> {
     max_value: Option<T::T>,
     bloom_filter: Option<Sbbf>,
     variable_length_bytes: Option<i64>,
+    geo_stats_accumulator: Option<Box<dyn GeoStatsAccumulator>>,
 }
 
 impl<T: DataType> ColumnValueEncoderImpl<T> {
```
```diff
@@ -145,16 +152,20 @@ impl<T: DataType> ColumnValueEncoderImpl<T> {
 
     fn write_slice(&mut self, slice: &[T::T]) -> Result<()> {
         if self.statistics_enabled != EnabledStatistics::None
-            // INTERVAL has undefined sort order, so don't write min/max stats for it
+            // INTERVAL, Geometry, and Geography have undefined sort order, so don't write min/max stats for them
             && self.descr.converted_type() != ConvertedType::INTERVAL
         {
-            if let Some((min, max)) = self.min_max(slice, None) {
-                update_min(&self.descr, &min, &mut self.min_value);
-                update_max(&self.descr, &max, &mut self.max_value);
-            }
+            if let Some(accumulator) = self.geo_stats_accumulator.as_mut() {
+                update_geo_stats_accumulator(accumulator.as_mut(), slice.iter());
+            } else {
+                if let Some((min, max)) = self.min_max(slice, None) {
+                    update_min(&self.descr, &min, &mut self.min_value);
+                    update_max(&self.descr, &max, &mut self.max_value);
+                }
 
-            if let Some(var_bytes) = T::T::variable_length_bytes(slice) {
-                *self.variable_length_bytes.get_or_insert(0) += var_bytes;
+                if let Some(var_bytes) = T::T::variable_length_bytes(slice) {
+                    *self.variable_length_bytes.get_or_insert(0) += var_bytes;
+                }
             }
         }
```
```diff
@@ -201,6 +212,15 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
             .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp))
             .transpose()?;
 
+        let geo_stats_accumulator = if matches!(
+            descr.logical_type(),
+            Some(LogicalType::Geometry) | Some(LogicalType::Geography)
+        ) {
+            Some(DefaultGeoStatsAccumulatorFactory::default().new_accumulator(descr))
+        } else {
+            None
+        };
+
         Ok(Self {
             encoder,
             dict_encoder,
```
```diff
@@ -211,6 +231,7 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
             min_value: None,
             max_value: None,
             variable_length_bytes: None,
+            geo_stats_accumulator,
         })
     }
```
```diff
@@ -307,6 +328,14 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
             variable_length_bytes: self.variable_length_bytes.take(),
         })
     }
+
+    fn flush_geospatial_statistics(&mut self) -> Option<Box<GeospatialStatistics>> {
+        if let Some(accumulator) = self.geo_stats_accumulator.as_mut() {
+            accumulator.finish()
+        } else {
+            None
+        }
+    }
 }
 
 fn get_min_max<'a, T, I>(descr: &ColumnDescriptor, mut iter: I) -> Option<(T, T)>
```
```diff
@@ -367,3 +396,17 @@ fn replace_zero<T: ParquetValueType>(val: &T, descr: &ColumnDescriptor, replace:
         _ => val.clone(),
     }
 }
+
+fn update_geo_stats_accumulator<'a, T, I>(bounder: &mut dyn GeoStatsAccumulator, iter: I)
+where
+    T: ParquetValueType + 'a,
+    I: Iterator<Item = &'a T>,
+{
+    if !bounder.is_valid() {
+        return;
+    }
+
+    for val in iter {
+        bounder.update_wkb(val.as_bytes());
+    }
+}
```
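For intuition: the helper above forwards each value's raw bytes (assumed to be WKB) into the accumulator, and `flush_geospatial_statistics` later drains it. The toy below is not the `GeoStatsAccumulator` from this PR (that trait sits behind the new feature flag and produces `GeospatialStatistics`); it is a self-contained sketch of the same is_valid / update_wkb / finish shape that only understands little-endian WKB points and tracks an x/y bounding box.

```rust
// Illustrative only: a toy accumulator mirroring the accumulate-then-finish
// pattern used by update_geo_stats_accumulator above.
#[derive(Debug, Default)]
struct ToyBboxAccumulator {
    invalid: bool,
    bbox: Option<(f64, f64, f64, f64)>, // (xmin, ymin, xmax, ymax)
}

impl ToyBboxAccumulator {
    fn is_valid(&self) -> bool {
        !self.invalid
    }

    /// Feed one WKB value; anything other than a little-endian POINT
    /// invalidates the statistics, after which updates become no-ops.
    fn update_wkb(&mut self, wkb: &[u8]) {
        if self.invalid {
            return;
        }
        // 1 byte order marker + 4 bytes geometry type + 2 * 8 coordinate bytes
        if wkb.len() != 21 || wkb[0] != 0x01 || wkb[1..5] != [0x01, 0x00, 0x00, 0x00] {
            self.invalid = true;
            return;
        }
        let x = f64::from_le_bytes(wkb[5..13].try_into().unwrap());
        let y = f64::from_le_bytes(wkb[13..21].try_into().unwrap());
        self.bbox = Some(match self.bbox {
            None => (x, y, x, y),
            Some((xmin, ymin, xmax, ymax)) => {
                (xmin.min(x), ymin.min(y), xmax.max(x), ymax.max(y))
            }
        });
    }

    /// Analogue of flush_geospatial_statistics: return accumulated stats, if any.
    fn finish(&mut self) -> Option<(f64, f64, f64, f64)> {
        if self.invalid {
            None
        } else {
            self.bbox.take()
        }
    }
}
```

The invalid flag plays the same role as `is_valid()` in the PR: once a value cannot be interpreted, the accumulator stops producing statistics rather than emitting a misleading bounding box.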
Could you please also add the new feature flag to the main crate readme as well?
https://github.com/apache/arrow-rs/blob/main/parquet/README.md#feature-flags