-
Notifications
You must be signed in to change notification settings - Fork 1k
[thrift-remodel] Incorporate changes made to geospatial statistics #8528
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 218 commits
e3a0b50
71d3859
ff42e5a
1f2c216
37f3b20
3d4e28e
2b85b89
5ee1b8f
624b88b
d99a06a
79a6917
878d460
009632a
998ac6c
a822dfd
20df075
7755b7b
66ed8bc
f6c5738
3733b86
09d71e1
1ddaa35
006d59d
c271085
34cdaf2
c9be570
a9cd09d
ae65167
272a013
632e171
9f01b60
2511f8f
61e9e07
e39f119
def3d07
386f222
7ae2304
6beb79d
1eaa17b
713e38a
79e8f85
b31c9e6
8c4e49d
e0e1852
1ebfdf2
366326a
7b8777a
d8081a9
5d6c8b1
709e813
def1d68
0579456
c1587c4
04b74f5
2250e18
6af8631
3775222
85f44a5
f0e538f
763ecd7
734ee9b
5569757
23636c9
179bb21
4f7bd62
51cf33a
b4ca56e
c702a44
0893ec7
689297c
7d47857
b543838
99ee049
f158d72
56f5c5d
b37029e
086d04c
ecd24de
1e510bc
138b0d5
5b6c177
88959be
9fe5a9a
52d73e9
29091cd
d13463a
ee810e1
3afcfac
486d851
3092ede
da66845
9ab7bb0
544eca0
0b33d25
39a9169
8de96ce
683d4e4
c729d22
96419c4
6ec102f
976b36d
ceac418
e10e274
ffeaa7a
e73a922
f81a732
be58ea6
02e5e16
61aa392
428e84c
751b0f1
a1cfbec
7268dd3
8305915
4221646
4342cb5
ddbeb55
8919c82
7112088
f0beb0b
b303e52
2955b85
3d33707
45fa0f4
07157ec
cfa6740
6c82028
b16e118
1afd866
29ddbc7
65e42d7
82f31a4
608c0f3
237ca3d
bdb9aa9
15ed645
2dcd8d0
2091e49
4da5d9e
afb4adf
9909d0c
418e45c
023f5d7
e0deed9
218b42b
67a82f4
1c71b42
49813ea
5298257
fd63d32
72ea850
ebae0af
7560e70
e94a2de
1ff8b88
b7c64ca
8d19468
56a75d6
b7a135b
7b549f9
a6ca284
dde0770
344ad12
943c674
b9e97c5
db2115a
c95ff97
0701d60
7fb0e13
e8dde76
9ebb8b1
4977f2f
2238925
cbf1624
7ec64a9
a87b0a2
5fd7a8f
1334370
5c5c826
737f653
facd852
2d789fd
f82fd45
1374686
bd682d1
3b8de59
1bca0a0
01dc4f3
c3907dc
9045533
10427c8
653fa1a
91e3df7
7f03758
4b8c68b
80fc032
61773a0
e34d362
a496854
a6a6326
b5651e5
ec73f7a
282a925
36c1dc1
e623a56
f6be170
8454d50
605292b
cba5d3d
e58c955
e6d80f7
ef5ef6d
0ba2bcb
70efc43
26108c0
bb5b688
6febae0
49f3957
793db5b
d8076c2
c37bce2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -349,15 +349,16 @@ pub enum LogicalType { | |
}, | ||
/// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation. | ||
Geometry { | ||
/// A custom CRS. If unset the defaults to `OGC:CRS84`. | ||
/// A custom CRS. If unset the defaults to `OGC:CRS84`, which means that the geometries | ||
/// must be stored in longitude, latitude based on the WGS84 datum. | ||
crs: Option<String>, | ||
}, | ||
/// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation. | ||
Geography { | ||
/// A custom CRS. If unset the defaults to `OGC:CRS84`. | ||
crs: Option<String>, | ||
/// An optional algorithm can be set to correctly interpret edges interpolation | ||
/// of the geometries. If unset, the algorithm defaults to `SPHERICAL``. | ||
/// of the geometries. If unset, the algorithm defaults to `SPHERICAL`. | ||
algorithm: Option<EdgeInterpolationAlgorithm>, | ||
}, | ||
/// For forward compatibility; used when an unknown union value is encountered. | ||
|
@@ -456,9 +457,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { | |
} | ||
18 => { | ||
let val = GeographyType::read_thrift(&mut *prot)?; | ||
let algorithm = val.algorithm.unwrap_or_default(); | ||
Self::Geography { | ||
crs: val.crs.map(|s| s.to_owned()), | ||
algorithm: val.algorithm, | ||
algorithm: Some(algorithm), | ||
} | ||
} | ||
_ => { | ||
|
@@ -928,17 +930,31 @@ enum BoundaryOrder { | |
// ---------------------------------------------------------------------- | ||
// Mirrors thrift enum `EdgeInterpolationAlgorithm` | ||
|
||
// TODO(ets): we need to allow for unknown variants. Either hand code this one, or add a new | ||
// macro that adds an _Unknown variant. | ||
|
||
|
||
thrift_enum!( | ||
/// Edge interpolation algorithm for Geography logical type | ||
enum EdgeInterpolationAlgorithm { | ||
/// Edges are interpolated as geodesics on a sphere. | ||
SPHERICAL = 0; | ||
/// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae> | ||
VINCENTY = 1; | ||
/// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 | ||
THOMAS = 2; | ||
/// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. | ||
ANDOYER = 3; | ||
/// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 | ||
KARNEY = 4; | ||
} | ||
); | ||
|
||
impl Default for EdgeInterpolationAlgorithm { | ||
fn default() -> Self { | ||
Self::SPHERICAL | ||
} | ||
} | ||
|
||
// ---------------------------------------------------------------------- | ||
// Mirrors thrift union `BloomFilterAlgorithm` | ||
|
||
|
@@ -1359,7 +1375,7 @@ impl str::FromStr for LogicalType { | |
"GEOMETRY" => Ok(LogicalType::Geometry { crs: None }), | ||
"GEOGRAPHY" => Ok(LogicalType::Geography { | ||
crs: None, | ||
algorithm: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), | ||
}), | ||
other => Err(general_err!("Invalid parquet logical type {}", other)), | ||
} | ||
|
@@ -1816,6 +1832,17 @@ mod tests { | |
ConvertedType::from(Some(LogicalType::Float16)), | ||
ConvertedType::NONE | ||
); | ||
assert_eq!( | ||
ConvertedType::from(Some(LogicalType::Geometry { crs: None })), | ||
ConvertedType::NONE | ||
); | ||
assert_eq!( | ||
ConvertedType::from(Some(LogicalType::Geography { | ||
crs: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::default()), | ||
})), | ||
ConvertedType::NONE | ||
); | ||
assert_eq!( | ||
ConvertedType::from(Some(LogicalType::Unknown)), | ||
ConvertedType::NONE | ||
|
@@ -1897,11 +1924,11 @@ mod tests { | |
}); | ||
test_roundtrip(LogicalType::Geography { | ||
crs: Some("foo".to_owned()), | ||
algorithm: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), | ||
}); | ||
test_roundtrip(LogicalType::Geography { | ||
crs: None, | ||
algorithm: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), | ||
}); | ||
} | ||
|
||
|
@@ -2113,7 +2140,15 @@ mod tests { | |
check_sort_order(signed, SortOrder::SIGNED); | ||
|
||
// Undefined comparison | ||
let undefined = vec![LogicalType::List, LogicalType::Map]; | ||
let undefined = vec![ | ||
LogicalType::List, | ||
LogicalType::Map, | ||
LogicalType::Geometry { crs: None }, | ||
LogicalType::Geography { | ||
crs: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::default()), | ||
}, | ||
]; | ||
check_sort_order(undefined, SortOrder::UNDEFINED); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -329,9 +329,20 @@ fn print_logical_and_converted( | |
LogicalType::Variant { | ||
specification_version, | ||
} => format!("VARIANT({specification_version:?})"), | ||
LogicalType::Geometry { crs } => format!("GEOMETRY({crs:?})"), | ||
LogicalType::Geometry { crs } => { | ||
if let Some(crs) = crs { | ||
format!("GEOMETRY({crs})") | ||
} else { | ||
"GEOMETRY".to_string() | ||
} | ||
} | ||
LogicalType::Geography { crs, algorithm } => { | ||
format!("GEOGRAPHY({crs:?},{algorithm:?})") | ||
let algorithm = algorithm.unwrap_or_default(); | ||
if let Some(crs) = crs { | ||
format!("GEOGRAPHY({algorithm}, {crs})") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just pointing out that the order of formatting was switched from crs-algorithm to algorithm-crs. I think this is a beneficial change because the CRS could be a long string. |
||
} else { | ||
format!("GEOGRAPHY({algorithm})") | ||
} | ||
} | ||
LogicalType::Unknown => "UNKNOWN".to_string(), | ||
LogicalType::_Unknown { field_id } => format!("_Unknown({field_id})"), | ||
|
@@ -454,7 +465,7 @@ mod tests { | |
|
||
use std::sync::Arc; | ||
|
||
use crate::basic::{Repetition, Type as PhysicalType}; | ||
use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType}; | ||
use crate::errors::Result; | ||
use crate::schema::parser::parse_message_type; | ||
|
||
|
@@ -784,6 +795,62 @@ mod tests { | |
.unwrap(), | ||
"REQUIRED BYTE_ARRAY field [42] (STRING);", | ||
), | ||
( | ||
build_primitive_type( | ||
"field", | ||
None, | ||
PhysicalType::BYTE_ARRAY, | ||
Some(LogicalType::Geometry { crs: None }), | ||
ConvertedType::NONE, | ||
Repetition::REQUIRED, | ||
) | ||
.unwrap(), | ||
"REQUIRED BYTE_ARRAY field (GEOMETRY);", | ||
), | ||
( | ||
build_primitive_type( | ||
"field", | ||
None, | ||
PhysicalType::BYTE_ARRAY, | ||
Some(LogicalType::Geometry { | ||
crs: Some("non-missing CRS".to_string()), | ||
}), | ||
ConvertedType::NONE, | ||
Repetition::REQUIRED, | ||
) | ||
.unwrap(), | ||
"REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));", | ||
), | ||
( | ||
build_primitive_type( | ||
"field", | ||
None, | ||
PhysicalType::BYTE_ARRAY, | ||
Some(LogicalType::Geography { | ||
crs: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::default()), | ||
}), | ||
ConvertedType::NONE, | ||
Repetition::REQUIRED, | ||
) | ||
.unwrap(), | ||
"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));", | ||
), | ||
( | ||
build_primitive_type( | ||
"field", | ||
None, | ||
PhysicalType::BYTE_ARRAY, | ||
Some(LogicalType::Geography { | ||
crs: Some("non-missing CRS".to_string()), | ||
algorithm: Some(EdgeInterpolationAlgorithm::default()), | ||
}), | ||
ConvertedType::NONE, | ||
Repetition::REQUIRED, | ||
) | ||
.unwrap(), | ||
"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));", | ||
), | ||
]; | ||
|
||
types_and_strings.into_iter().for_each(|(field, expected)| { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Tests for Geometry and Geography logical types | ||
use parquet::{ | ||
basic::{EdgeInterpolationAlgorithm, LogicalType}, | ||
file::{ | ||
metadata::ParquetMetaData, | ||
reader::{FileReader, SerializedFileReader}, | ||
}, | ||
geospatial::bounding_box::BoundingBox, | ||
}; | ||
use serde_json::Value; | ||
use std::fs::File; | ||
|
||
fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { | ||
let path = format!( | ||
"{}/geospatial/{geospatial_test_file}", | ||
arrow::util::test_util::parquet_test_data(), | ||
); | ||
let file = File::open(path).unwrap(); | ||
let reader = SerializedFileReader::try_from(file).unwrap(); | ||
reader.metadata().clone() | ||
} | ||
|
||
#[test] | ||
fn test_read_logical_type() { | ||
// Some crs values are short strings | ||
let expected_logical_type = [ | ||
("crs-default.parquet", LogicalType::Geometry { crs: None }), | ||
( | ||
"crs-srid.parquet", | ||
LogicalType::Geometry { | ||
crs: Some("srid:5070".to_string()), | ||
}, | ||
), | ||
( | ||
"crs-projjson.parquet", | ||
LogicalType::Geometry { | ||
crs: Some("projjson:projjson_epsg_5070".to_string()), | ||
}, | ||
), | ||
( | ||
"crs-geography.parquet", | ||
LogicalType::Geography { | ||
crs: None, | ||
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL), | ||
}, | ||
), | ||
]; | ||
|
||
for (geospatial_file, expected_type) in expected_logical_type { | ||
let metadata = read_metadata(geospatial_file); | ||
let logical_type = metadata | ||
.file_metadata() | ||
.schema_descr() | ||
.column(1) | ||
.logical_type() | ||
.unwrap(); | ||
|
||
assert_eq!(logical_type, expected_type); | ||
} | ||
|
||
// The crs value may also contain arbitrary values (in this case some JSON | ||
// a bit too lengthy to type out) | ||
let metadata = read_metadata("crs-arbitrary-value.parquet"); | ||
let logical_type = metadata | ||
.file_metadata() | ||
.schema_descr() | ||
.column(1) | ||
.logical_type() | ||
.unwrap(); | ||
|
||
if let LogicalType::Geometry { crs } = logical_type { | ||
let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap(); | ||
assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070); | ||
} else { | ||
panic!("Expected geometry type but got {logical_type:?}"); | ||
} | ||
} | ||
|
||
#[test] | ||
fn test_read_geospatial_statistics() { | ||
let metadata = read_metadata("geospatial.parquet"); | ||
|
||
// geospatial.parquet schema: | ||
// optional binary field_id=-1 group (String); | ||
// optional binary field_id=-1 wkt (String); | ||
// optional binary field_id=-1 geometry (Geometry(crs=)); | ||
let fields = metadata.file_metadata().schema().get_fields(); | ||
let logical_type = fields[2].get_basic_info().logical_type().unwrap(); | ||
assert_eq!(logical_type, LogicalType::Geometry { crs: None }); | ||
|
||
let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); | ||
assert!(geo_statistics.is_some()); | ||
|
||
let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) | ||
.with_zrange(30.0, 80.0) | ||
.with_mrange(200.0, 1600.0); | ||
let expected_geospatial_types = vec![ | ||
1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, | ||
2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, | ||
]; | ||
assert_eq!( | ||
geo_statistics.unwrap().geospatial_types(), | ||
Some(&expected_geospatial_types) | ||
); | ||
assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox)); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change gives me the most pause. If the value is not set in the thrift, do we really want to set a default or leave it unset and handle that downstream?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't mind either way...the interpretation of "the default" is part of the Parquet standard (if unset in Thrift, the interpretation is spherical), my thought was that this would make it so that others don't have to read the spec in order to do the right thing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, guess I should have read the spec 😅. I'll leave this as is then. @alamb do you have an opinion here? (relevent section of the spec is here).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree this seems pretty clear cut:
Maybe we could change this to explicitly name SPHERICAL and reference the spec, something like