diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 350f2b6de1e2..68eebaf5080a 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -349,7 +349,8 @@ pub enum LogicalType { }, /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation. Geometry { - /// A custom CRS. If unset the defaults to `OGC:CRS84`. + /// A custom CRS. If unset the defaults to `OGC:CRS84`, which means that the geometries + /// must be stored in longitude, latitude based on the WGS84 datum. crs: Option, }, /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation. @@ -357,7 +358,7 @@ pub enum LogicalType { /// A custom CRS. If unset the defaults to `OGC:CRS84`. crs: Option, /// An optional algorithm can be set to correctly interpret edges interpolation - /// of the geometries. If unset, the algorithm defaults to `SPHERICAL``. + /// of the geometries. If unset, the algorithm defaults to `SPHERICAL`. algorithm: Option, }, /// For forward compatibility; used when an unknown union value is encountered. @@ -456,9 +457,14 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { } 18 => { let val = GeographyType::read_thrift(&mut *prot)?; + // unset algorithm means SPHERICAL, per the spec: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography + let algorithm = val + .algorithm + .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL); Self::Geography { crs: val.crs.map(|s| s.to_owned()), - algorithm: val.algorithm, + algorithm: Some(algorithm), } } _ => { @@ -928,16 +934,79 @@ enum BoundaryOrder { // ---------------------------------------------------------------------- // Mirrors thrift enum `EdgeInterpolationAlgorithm` -thrift_enum!( -/// Edge interpolation algorithm for Geography logical type -enum EdgeInterpolationAlgorithm { - SPHERICAL = 0; - VINCENTY = 1; - THOMAS = 2; - ANDOYER = 3; - KARNEY = 4; +// this is hand coded to allow for the _Unknown variant (allows this to be forward compatible) + +/// Edge interpolation algorithm for [`LogicalType::Geography`] +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[repr(i32)] +pub enum EdgeInterpolationAlgorithm { + /// Edges are interpolated as geodesics on a sphere. + SPHERICAL = 0, + /// + VINCENTY = 1, + /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 + THOMAS = 2, + /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. + ANDOYER = 3, + /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 + KARNEY = 4, + /// Unknown algorithm + _Unknown(i32), +} + +impl fmt::Display for EdgeInterpolationAlgorithm { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_fmt(format_args!("{0:?}", self)) + } +} + +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EdgeInterpolationAlgorithm { + fn read_thrift(prot: &mut R) -> Result { + let val = prot.read_i32()?; + match val { + 0 => Ok(Self::SPHERICAL), + 1 => Ok(Self::VINCENTY), + 2 => Ok(Self::THOMAS), + 3 => Ok(Self::ANDOYER), + 4 => Ok(Self::KARNEY), + _ => Ok(Self::_Unknown(val)), + } + } +} + +impl WriteThrift for EdgeInterpolationAlgorithm { + const ELEMENT_TYPE: ElementType = ElementType::I32; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let val: i32 = match *self { + Self::SPHERICAL => 0, + Self::VINCENTY => 1, + Self::THOMAS => 2, + Self::ANDOYER => 3, + Self::KARNEY => 4, + Self::_Unknown(i) => i, + }; + writer.write_i32(val) + } +} + +impl WriteThriftField for EdgeInterpolationAlgorithm { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + +impl Default for EdgeInterpolationAlgorithm { + fn default() -> Self { + Self::SPHERICAL + } } -); // ---------------------------------------------------------------------- // Mirrors thrift union `BloomFilterAlgorithm` @@ -945,7 +1014,7 @@ enum EdgeInterpolationAlgorithm { thrift_union_all_empty!( /// The algorithm used in Bloom filter. union BloomFilterAlgorithm { - /** Block-based Bloom filter. **/ + /// Block-based Bloom filter. 1: SplitBlockAlgorithm BLOCK; } ); @@ -957,7 +1026,7 @@ thrift_union_all_empty!( /// The hash function used in Bloom filter. This function takes the hash of a column value /// using plain encoding. union BloomFilterHash { - /** xxHash Strategy. **/ + /// xxHash Strategy. 1: XxHash XXHASH; } ); @@ -1359,7 +1428,7 @@ impl str::FromStr for LogicalType { "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }), "GEOGRAPHY" => Ok(LogicalType::Geography { crs: None, - algorithm: None, + algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), }), other => Err(general_err!("Invalid parquet logical type {}", other)), } @@ -1816,6 +1885,17 @@ mod tests { ConvertedType::from(Some(LogicalType::Float16)), ConvertedType::NONE ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geometry { crs: None })), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::default()), + })), + ConvertedType::NONE + ); assert_eq!( ConvertedType::from(Some(LogicalType::Unknown)), ConvertedType::NONE @@ -1897,11 +1977,11 @@ mod tests { }); test_roundtrip(LogicalType::Geography { crs: Some("foo".to_owned()), - algorithm: None, + algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), }); test_roundtrip(LogicalType::Geography { crs: None, - algorithm: None, + algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), }); } @@ -2113,7 +2193,15 @@ mod tests { check_sort_order(signed, SortOrder::SIGNED); // Undefined comparison - let undefined = vec![LogicalType::List, LogicalType::Map]; + let undefined = vec![ + LogicalType::List, + LogicalType::Map, + LogicalType::Geometry { crs: None }, + LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::default()), + }, + ]; check_sort_order(undefined, SortOrder::UNDEFINED); } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 7a0b32bfe12e..489cb44cd77b 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -1585,7 +1585,7 @@ impl WriteThrift for crate::geospatial::statistics::GeospatialStatistics { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; - if let Some(bbox) = self.bbox() { + if let Some(bbox) = self.bounding_box() { last_field_id = bbox.write_thrift_field(writer, 1, last_field_id)?; } if let Some(geo_types) = self.geospatial_types() { diff --git a/parquet/src/geospatial/statistics.rs b/parquet/src/geospatial/statistics.rs index 2e99d9c62aff..d3287412b143 100644 --- a/parquet/src/geospatial/statistics.rs +++ b/parquet/src/geospatial/statistics.rs @@ -58,12 +58,12 @@ impl GeospatialStatistics { } } - /// Return the optional `BoundingBox`. - pub fn bbox(&self) -> Option<&BoundingBox> { + /// Optional bounding defining the spatial extent, where `None` represents a lack of information. + pub fn bounding_box(&self) -> Option<&BoundingBox> { self.bbox.as_ref() } - /// Return the optional list of geospatial types. + /// Optional list of geometry type identifiers, where `None` represents a lack of information. pub fn geospatial_types(&self) -> Option<&Vec> { self.geospatial_types.as_ref() } diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index fd28e13d2edf..0cc5df59f329 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -329,9 +329,20 @@ fn print_logical_and_converted( LogicalType::Variant { specification_version, } => format!("VARIANT({specification_version:?})"), - LogicalType::Geometry { crs } => format!("GEOMETRY({crs:?})"), + LogicalType::Geometry { crs } => { + if let Some(crs) = crs { + format!("GEOMETRY({crs})") + } else { + "GEOMETRY".to_string() + } + } LogicalType::Geography { crs, algorithm } => { - format!("GEOGRAPHY({crs:?},{algorithm:?})") + let algorithm = algorithm.unwrap_or_default(); + if let Some(crs) = crs { + format!("GEOGRAPHY({algorithm}, {crs})") + } else { + format!("GEOGRAPHY({algorithm})") + } } LogicalType::Unknown => "UNKNOWN".to_string(), LogicalType::_Unknown { field_id } => format!("_Unknown({field_id})"), @@ -454,7 +465,7 @@ mod tests { use std::sync::Arc; - use crate::basic::{Repetition, Type as PhysicalType}; + use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType}; use crate::errors::Result; use crate::schema::parser::parse_message_type; @@ -784,6 +795,62 @@ mod tests { .unwrap(), "REQUIRED BYTE_ARRAY field [42] (STRING);", ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { crs: None }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOMETRY);", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { + crs: Some("non-missing CRS".to_string()), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::default()), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: Some("non-missing CRS".to_string()), + algorithm: Some(EdgeInterpolationAlgorithm::default()), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));", + ), ]; types_and_strings.into_iter().for_each(|(field, expected)| { diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs new file mode 100644 index 000000000000..b3de40491b30 --- /dev/null +++ b/parquet/tests/geospatial.rs @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for Geometry and Geography logical types +use parquet::{ + basic::{EdgeInterpolationAlgorithm, LogicalType}, + file::{ + metadata::ParquetMetaData, + reader::{FileReader, SerializedFileReader}, + }, + geospatial::bounding_box::BoundingBox, +}; +use serde_json::Value; +use std::fs::File; + +fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { + let path = format!( + "{}/geospatial/{geospatial_test_file}", + arrow::util::test_util::parquet_test_data(), + ); + let file = File::open(path).unwrap(); + let reader = SerializedFileReader::try_from(file).unwrap(); + reader.metadata().clone() +} + +#[test] +fn test_read_logical_type() { + // Some crs values are short strings + let expected_logical_type = [ + ("crs-default.parquet", LogicalType::Geometry { crs: None }), + ( + "crs-srid.parquet", + LogicalType::Geometry { + crs: Some("srid:5070".to_string()), + }, + ), + ( + "crs-projjson.parquet", + LogicalType::Geometry { + crs: Some("projjson:projjson_epsg_5070".to_string()), + }, + ), + ( + "crs-geography.parquet", + LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), + }, + ), + ]; + + for (geospatial_file, expected_type) in expected_logical_type { + let metadata = read_metadata(geospatial_file); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + assert_eq!(logical_type, expected_type); + } + + // The crs value may also contain arbitrary values (in this case some JSON + // a bit too lengthy to type out) + let metadata = read_metadata("crs-arbitrary-value.parquet"); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + if let LogicalType::Geometry { crs } = logical_type { + let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap(); + assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070); + } else { + panic!("Expected geometry type but got {logical_type:?}"); + } +} + +#[test] +fn test_read_geospatial_statistics() { + let metadata = read_metadata("geospatial.parquet"); + + // geospatial.parquet schema: + // optional binary field_id=-1 group (String); + // optional binary field_id=-1 wkt (String); + // optional binary field_id=-1 geometry (Geometry(crs=)); + let fields = metadata.file_metadata().schema().get_fields(); + let logical_type = fields[2].get_basic_info().logical_type().unwrap(); + assert_eq!(logical_type, LogicalType::Geometry { crs: None }); + + let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); + assert!(geo_statistics.is_some()); + + let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) + .with_zrange(30.0, 80.0) + .with_mrange(200.0, 1600.0); + let expected_geospatial_types = vec![ + 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, + ]; + assert_eq!( + geo_statistics.unwrap().geospatial_types(), + Some(&expected_geospatial_types) + ); + assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox)); +}