Skip to content
Merged
Show file tree
Hide file tree
Changes from 218 commits
Commits
Show all changes
221 commits
Select commit Hold shift + click to select a range
e3a0b50
custom PageLocation decoder for speed
etseidl Aug 20, 2025
71d3859
fix recently added test
etseidl Aug 20, 2025
ff42e5a
clippy
etseidl Aug 20, 2025
1f2c216
experimental new form for column index
etseidl Aug 20, 2025
37f3b20
fix for test added in main
etseidl Aug 21, 2025
3d4e28e
refactor new column index
etseidl Aug 21, 2025
2b85b89
checkpoint...everything but stats converter
etseidl Aug 21, 2025
5ee1b8f
fix bug found in testing
etseidl Aug 21, 2025
624b88b
Merge branch 'new_col_idx' into new_col_idx_full
etseidl Aug 21, 2025
d99a06a
stats converter works
etseidl Aug 22, 2025
79a6917
get rid of import
etseidl Aug 22, 2025
878d460
get parquet-index working
etseidl Aug 22, 2025
009632a
doc fixes
etseidl Aug 22, 2025
998ac6c
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 22, 2025
a822dfd
move column index to its own module
etseidl Aug 22, 2025
20df075
add ColumnIndexIterators trait, simplify stats converter a little
etseidl Aug 22, 2025
7755b7b
restore comment
etseidl Aug 22, 2025
66ed8bc
Merge branch 'new_col_idx' into new_col_idx_full
etseidl Aug 22, 2025
f6c5738
further rework...allow for fallback to slow decoder
etseidl Aug 24, 2025
3733b86
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 24, 2025
09d71e1
refactor a bit
etseidl Aug 24, 2025
1ddaa35
simplify reading of int array
etseidl Aug 24, 2025
006d59d
Merge branch 'offset_idx_speedup' into new_col_idx_full
etseidl Aug 24, 2025
c271085
get write working for enum and some unions
etseidl Aug 25, 2025
34cdaf2
make test_roundtrip visible
etseidl Aug 25, 2025
c9be570
add test for converted_type, start on logical_type
etseidl Aug 25, 2025
a9cd09d
checkpoint struct field writing
etseidl Aug 25, 2025
ae65167
get some struct examples and lists working
etseidl Aug 25, 2025
272a013
get rid of copied allow
etseidl Aug 25, 2025
632e171
get writer macros for structs working
etseidl Aug 26, 2025
9f01b60
fix bug in struct macro
etseidl Aug 26, 2025
2511f8f
make Repetition public
etseidl Aug 26, 2025
61e9e07
get union working for writes
etseidl Aug 26, 2025
e39f119
add some tests
etseidl Aug 26, 2025
def3d07
redo OrderedF64 initialization
etseidl Aug 26, 2025
386f222
unused import
etseidl Aug 26, 2025
7ae2304
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 26, 2025
6beb79d
get decryption working
etseidl Aug 26, 2025
1eaa17b
refactor and clippy fixes
etseidl Aug 26, 2025
713e38a
add page header defs
etseidl Aug 26, 2025
79e8f85
totally rework the input side
etseidl Aug 27, 2025
b31c9e6
rework struct field reading
etseidl Aug 27, 2025
8c4e49d
fix skipping bool fields
etseidl Aug 27, 2025
e0e1852
remove cruft
etseidl Aug 27, 2025
1ebfdf2
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 27, 2025
366326a
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 27, 2025
7b8777a
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 27, 2025
d8081a9
fix clippy issues
etseidl Aug 28, 2025
5d6c8b1
allow unused page header structs
etseidl Aug 28, 2025
709e813
remove Write from WriteThrift
etseidl Aug 29, 2025
def1d68
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 29, 2025
0579456
finish merge
etseidl Aug 29, 2025
c1587c4
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 29, 2025
04b74f5
stats
etseidl Aug 29, 2025
2250e18
get new page headers working for read and write
etseidl Aug 29, 2025
6af8631
rename page header structs
etseidl Aug 29, 2025
3775222
add some fixmes
etseidl Aug 29, 2025
85f44a5
formatting
etseidl Aug 29, 2025
f0e538f
test results differ depending on features
etseidl Aug 29, 2025
763ecd7
error rather than panic on missing required fields
etseidl Aug 29, 2025
734ee9b
add option to read page stats
etseidl Aug 29, 2025
5569757
add comments
etseidl Aug 29, 2025
23636c9
clippy
etseidl Aug 29, 2025
179bb21
switch page header bench to new code
etseidl Aug 29, 2025
4f7bd62
add comment
etseidl Aug 29, 2025
51cf33a
benchmark changes
etseidl Aug 29, 2025
b4ca56e
update benchmarks to match thrift-remodel feature branch
etseidl Aug 29, 2025
c702a44
add encoding_stats to wide data set
etseidl Aug 30, 2025
0893ec7
clippy
etseidl Aug 30, 2025
689297c
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Aug 30, 2025
7d47857
Merge branch 'write_thrift' into read_and_crypto
etseidl Aug 30, 2025
b543838
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Aug 30, 2025
99ee049
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Aug 30, 2025
f158d72
Merge branch 'update_metadata_bench' into read_page_header
etseidl Aug 30, 2025
56f5c5d
remove dup from merge
etseidl Sep 4, 2025
b37029e
checkpoint offset index
etseidl Sep 5, 2025
086d04c
write path for column index
etseidl Sep 5, 2025
ecd24de
copy over tests from index
etseidl Sep 5, 2025
1e510bc
remove index module
etseidl Sep 5, 2025
138b0d5
Merge branch 'gh5854_thrift_remodel' into write_thrift
etseidl Sep 5, 2025
5b6c177
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 5, 2025
88959be
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 5, 2025
9fe5a9a
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 5, 2025
52d73e9
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 5, 2025
29091cd
refactor column index building
etseidl Sep 5, 2025
d13463a
checkpoint
etseidl Sep 5, 2025
ee810e1
checkpoint encrypt column meta
etseidl Sep 6, 2025
3afcfac
checkpoint...write code finished
etseidl Sep 7, 2025
486d851
checkpoint...almost works
etseidl Sep 8, 2025
3092ede
some test fixes and cleanup
etseidl Sep 8, 2025
da66845
more fixes and cleanup
etseidl Sep 8, 2025
9ab7bb0
clippy fixes
etseidl Sep 8, 2025
544eca0
start removing references to format
etseidl Sep 8, 2025
0b33d25
more format cleanup
etseidl Sep 8, 2025
39a9169
remove format references from docs
etseidl Sep 8, 2025
8de96ce
remove format conversion functions
etseidl Sep 8, 2025
683d4e4
remove format::CompressionCodec
etseidl Sep 8, 2025
c729d22
Merge remote-tracking branch 'origin/gh5854_thrift_remodel' into writ…
etseidl Sep 8, 2025
96419c4
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 8, 2025
6ec102f
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 8, 2025
976b36d
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 8, 2025
ceac418
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 8, 2025
e10e274
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 8, 2025
ffeaa7a
Merge branch 'write_thrift' into remove_format
etseidl Sep 8, 2025
e73a922
remove format from statistics
etseidl Sep 9, 2025
f81a732
get a start on some documentation and add some TODOs
etseidl Sep 10, 2025
be58ea6
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 10, 2025
02e5e16
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
61aa392
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
428e84c
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
751b0f1
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 10, 2025
a1cfbec
Merge branch 'write_file_meta' into remove_format
etseidl Sep 10, 2025
7268dd3
fix docs
etseidl Sep 10, 2025
8305915
Merge branch 'write_thrift' into read_and_crypto
etseidl Sep 10, 2025
4221646
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
4342cb5
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
ddbeb55
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
8919c82
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 10, 2025
7112088
Merge branch 'write_file_meta' into remove_format
etseidl Sep 10, 2025
f0beb0b
Merge branch 'gh5854_thrift_remodel' into read_and_crypto
etseidl Sep 10, 2025
b303e52
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
2955b85
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
3d33707
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
45fa0f4
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 10, 2025
07157ec
Merge branch 'write_file_meta' into remove_format
etseidl Sep 10, 2025
cfa6740
backport fix for tests without encryption
etseidl Sep 10, 2025
6c82028
Merge branch 'read_and_crypto' into rework_thrift_reader
etseidl Sep 10, 2025
b16e118
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 10, 2025
1afd866
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 10, 2025
29ddbc7
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 10, 2025
65e42d7
Merge branch 'write_file_meta' into remove_format
etseidl Sep 10, 2025
82f31a4
add documentation
etseidl Sep 11, 2025
608c0f3
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 11, 2025
237ca3d
add docs for ThriftReadInputProtocol
etseidl Sep 11, 2025
bdb9aa9
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 11, 2025
15ed645
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 11, 2025
2dcd8d0
Merge branch 'write_file_meta' into remove_format
etseidl Sep 11, 2025
2091e49
move PageEncodingStats to thrift_gen
etseidl Sep 11, 2025
4da5d9e
Merge branch 'gh5854_thrift_remodel' into rework_thrift_reader
etseidl Sep 12, 2025
afb4adf
Merge branch 'rework_thrift_reader' into read_page_header
etseidl Sep 12, 2025
9909d0c
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 12, 2025
418e45c
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 12, 2025
023f5d7
Merge branch 'write_file_meta' into remove_format
etseidl Sep 12, 2025
e0deed9
move PageEncodingStats to mod.rs since it needs to be public
etseidl Sep 15, 2025
218b42b
cleanup and add some documentation
etseidl Sep 16, 2025
67a82f4
start on documentation of thrift processing
etseidl Sep 16, 2025
1c71b42
more docs
etseidl Sep 16, 2025
49813ea
finish first cut of THRIFT.md
etseidl Sep 16, 2025
5298257
clean up some stale documentation references
etseidl Sep 16, 2025
fd63d32
add a todo
etseidl Sep 16, 2025
72ea850
more doc cleanup
etseidl Sep 16, 2025
ebae0af
Merge branch 'gh5854_thrift_remodel' into read_page_header
etseidl Sep 17, 2025
7560e70
fix typo
etseidl Sep 17, 2025
e94a2de
fix typo
etseidl Sep 17, 2025
1ff8b88
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 17, 2025
b7c64ca
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 17, 2025
8d19468
Merge branch 'write_file_meta' into remove_format
etseidl Sep 17, 2025
56a75d6
clean up some imports
etseidl Sep 17, 2025
b7a135b
increment shift after test
etseidl Sep 18, 2025
7b549f9
update docs for PageStatistics
etseidl Sep 23, 2025
a6ca284
Merge branch 'read_page_header' into write_page_indexes
etseidl Sep 23, 2025
dde0770
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 23, 2025
344ad12
Merge branch 'write_file_meta' into remove_format
etseidl Sep 23, 2025
943c674
Merge remote-tracking branch 'origin/gh5854_thrift_remodel' into writ…
etseidl Sep 23, 2025
b9e97c5
Merge branch 'gh5854_thrift_remodel' into write_page_indexes
etseidl Sep 23, 2025
db2115a
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 23, 2025
c95ff97
Merge branch 'write_file_meta' into remove_format
etseidl Sep 23, 2025
0701d60
backport some doc fixes
etseidl Sep 23, 2025
7fb0e13
Merge branch 'write_page_indexes' into write_file_meta
etseidl Sep 23, 2025
e8dde76
Merge branch 'write_file_meta' into remove_format
etseidl Sep 23, 2025
9ebb8b1
fix recently added test
etseidl Sep 23, 2025
4977f2f
fix recently added test
etseidl Sep 23, 2025
2238925
Merge branch 'write_file_meta' into remove_format
etseidl Sep 23, 2025
cbf1624
add TODO
etseidl Sep 23, 2025
7ec64a9
Merge branch 'gh5854_thrift_remodel' into write_file_meta
etseidl Sep 25, 2025
a87b0a2
forgot to check this in during merge
etseidl Sep 25, 2025
5fd7a8f
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
1334370
remove TODO
etseidl Sep 25, 2025
5c5c826
add HeapSize for crypto fields on chunk metadata
etseidl Sep 25, 2025
737f653
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
facd852
Merge branch 'gh5854_thrift_remodel' into write_file_meta
etseidl Sep 25, 2025
2d789fd
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
f82fd45
Merge branch 'gh5854_thrift_remodel' into write_file_meta
etseidl Sep 25, 2025
1374686
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
bd682d1
Merge branch 'gh5854_thrift_remodel' into write_file_meta
etseidl Sep 25, 2025
3b8de59
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
1bca0a0
remove unnecessary checks
etseidl Sep 25, 2025
01dc4f3
Merge branch 'write_file_meta' into remove_format
etseidl Sep 25, 2025
c3907dc
implement suggestions from review
etseidl Sep 26, 2025
9045533
Merge branch 'gh5854_thrift_remodel' into write_file_meta
etseidl Sep 26, 2025
10427c8
Merge branch 'write_file_meta' into remove_format
etseidl Sep 26, 2025
653fa1a
remove TODO
etseidl Sep 26, 2025
91e3df7
more todos
etseidl Sep 26, 2025
7f03758
Merge branch 'gh5854_thrift_remodel' into remove_format
etseidl Sep 26, 2025
4b8c68b
variant logical type fixes
etseidl Sep 26, 2025
80fc032
remove lint
etseidl Sep 26, 2025
61773a0
remove private APIs from metadata benchmark
etseidl Sep 26, 2025
e34d362
merge in changes to benchmark
etseidl Sep 26, 2025
a496854
Merge branch 'gh5854_thrift_remodel' into remove_format
etseidl Sep 26, 2025
a6a6326
Apply suggestions from code review
etseidl Sep 29, 2025
b5651e5
add test of invalid converted type
etseidl Sep 29, 2025
ec73f7a
Merge branch 'remove_format' of github.com:etseidl/arrow-rs into remo…
etseidl Sep 29, 2025
282a925
use raw identifier for 'type' in SchemaElement
etseidl Sep 29, 2025
36c1dc1
Merge branch 'gh5854_thrift_remodel' into remove_format
etseidl Sep 29, 2025
e623a56
finish merge
etseidl Sep 29, 2025
f6be170
fix test
etseidl Sep 29, 2025
8454d50
some doc changes from review comments
etseidl Sep 29, 2025
605292b
Merge branch 'gh5854_thrift_remodel' into remove_format
etseidl Sep 30, 2025
cba5d3d
rename more type_ fields as r#type
etseidl Sep 30, 2025
e58c955
clean up parquet_thrift
etseidl Sep 30, 2025
e6d80f7
make file_path match with/without encryption
etseidl Sep 30, 2025
ef5ef6d
clean up some docs
etseidl Sep 30, 2025
0ba2bcb
refactor parser to cluster more encryption specific code
etseidl Sep 30, 2025
70efc43
remove a few allocations
etseidl Oct 1, 2025
26108c0
Merge branch 'gh5854_thrift_remodel' into remove_format
etseidl Oct 1, 2025
bb5b688
remove TODO
etseidl Oct 1, 2025
6febae0
merge in changes to geo spatial stats
etseidl Oct 1, 2025
49f3957
Merge branch 'gh5854_thrift_remodel' into merge_geo_spatial
etseidl Oct 1, 2025
793db5b
allow for unknown variants and some doc cleanups
etseidl Oct 1, 2025
d8076c2
Merge remote-tracking branch 'origin/gh5854_thrift_remodel' into merg…
etseidl Oct 1, 2025
c37bce2
clean up leftover #allow
etseidl Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 42 additions & 7 deletions parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,16 @@ pub enum LogicalType {
},
/// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation.
Geometry {
/// A custom CRS. If unset the defaults to `OGC:CRS84`.
/// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`, which means that the geometries
/// must be stored in longitude, latitude based on the WGS84 datum.
crs: Option<String>,
},
/// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation.
Geography {
/// A custom CRS. If unset, the CRS defaults to `OGC:CRS84`.
crs: Option<String>,
/// An optional algorithm can be set to correctly interpret edges interpolation
/// of the geometries. If unset, the algorithm defaults to `SPHERICAL``.
/// of the geometries. If unset, the algorithm defaults to `SPHERICAL`.
algorithm: Option<EdgeInterpolationAlgorithm>,
},
/// For forward compatibility; used when an unknown union value is encountered.
Expand Down Expand Up @@ -456,9 +457,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {
}
18 => {
let val = GeographyType::read_thrift(&mut *prot)?;
let algorithm = val.algorithm.unwrap_or_default();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change gives me the most pause. If the value is not set in the thrift, do we really want to set a default or leave it unset and handle that downstream?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't mind either way...the interpretation of "the default" is part of the Parquet standard (if unset in Thrift, the interpretation is spherical), my thought was that this would make it so that others don't have to read the spec in order to do the right thing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, guess I should have read the spec 😅. I'll leave this as is then. @alamb do you have an opinion here? (relevant section of the spec is here).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, guess I should have read the spec 😅. I'll leave this as is then. @alamb do you have an opinion here? (relevant section of the spec is here).

I agree this seems pretty clear cut:

If unset, the algorithm defaults to SPHERICAL.

Maybe we could change this to explicitly name SPHERICAL and reference the spec, something like

Suggested change
let algorithm = val.algorithm.unwrap_or_default();
let algorithm = val.algorithm
// unset algorithm means spherical, per the spec:
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
.unwrap_or(EdgeInterpolationAlgorithm::Spherical)

Self::Geography {
crs: val.crs.map(|s| s.to_owned()),
algorithm: val.algorithm,
algorithm: Some(algorithm),
}
}
_ => {
Expand Down Expand Up @@ -928,17 +930,31 @@ enum BoundaryOrder {
// ----------------------------------------------------------------------
// Mirrors thrift enum `EdgeInterpolationAlgorithm`

// TODO(ets): we need to allow for unknown variants. Either hand code this one, or add a new
// macro that adds an _Unknown variant.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I'm assuming that an unknown algorithm will result in ignoring the stats, so should not be fatal.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The algorithm is taken into account by the writer when writing the stats...statistics for Geography are in theory safe to use for pruning even if the algorithm is unrecognized (although it's difficult to imagine a situation where this would occur except a corrupted file). I put UNKNOWN in the other PR because I couldn't make the From<> implementation infallible without it but perhaps you don't have that constraint here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If stats are robust to unknown algorithms, then I think we should add an _Unknown variant here so older readers can still handle newer files. I'll make that change soon.


// Declares `EdgeInterpolationAlgorithm` through the `thrift_enum!` macro. Each variant is
// listed with an explicit integer value, numbered consecutively from 0 (`SPHERICAL`) to
// 4 (`KARNEY`).
// NOTE(review): the values presumably have to match the Thrift definition this enum
// mirrors; confirm against the Parquet format before renumbering or reordering.
thrift_enum!(
/// Edge interpolation algorithm for Geography logical type
enum EdgeInterpolationAlgorithm {
/// Edges are interpolated as geodesics on a sphere.
SPHERICAL = 0;
/// <https://en.wikipedia.org/wiki/Vincenty%27s_formulae>
VINCENTY = 1;
/// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970
THOMAS = 2;
/// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965.
ANDOYER = 3;
/// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55
KARNEY = 4;
}
);

impl Default for EdgeInterpolationAlgorithm {
fn default() -> Self {
Self::SPHERICAL
}
}

// ----------------------------------------------------------------------
// Mirrors thrift union `BloomFilterAlgorithm`

Expand Down Expand Up @@ -1359,7 +1375,7 @@ impl str::FromStr for LogicalType {
"GEOMETRY" => Ok(LogicalType::Geometry { crs: None }),
"GEOGRAPHY" => Ok(LogicalType::Geography {
crs: None,
algorithm: None,
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
}),
other => Err(general_err!("Invalid parquet logical type {}", other)),
}
Expand Down Expand Up @@ -1816,6 +1832,17 @@ mod tests {
ConvertedType::from(Some(LogicalType::Float16)),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Geometry { crs: None })),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Geography {
crs: None,
algorithm: Some(EdgeInterpolationAlgorithm::default()),
})),
ConvertedType::NONE
);
assert_eq!(
ConvertedType::from(Some(LogicalType::Unknown)),
ConvertedType::NONE
Expand Down Expand Up @@ -1897,11 +1924,11 @@ mod tests {
});
test_roundtrip(LogicalType::Geography {
crs: Some("foo".to_owned()),
algorithm: None,
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
});
test_roundtrip(LogicalType::Geography {
crs: None,
algorithm: None,
algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
});
}

Expand Down Expand Up @@ -2113,7 +2140,15 @@ mod tests {
check_sort_order(signed, SortOrder::SIGNED);

// Undefined comparison
let undefined = vec![LogicalType::List, LogicalType::Map];
let undefined = vec![
LogicalType::List,
LogicalType::Map,
LogicalType::Geometry { crs: None },
LogicalType::Geography {
crs: None,
algorithm: Some(EdgeInterpolationAlgorithm::default()),
},
];
check_sort_order(undefined, SortOrder::UNDEFINED);
}

Expand Down
2 changes: 1 addition & 1 deletion parquet/src/file/metadata/thrift_gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1585,7 +1585,7 @@ impl WriteThrift for crate::geospatial::statistics::GeospatialStatistics {

fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
let mut last_field_id = 0i16;
if let Some(bbox) = self.bbox() {
if let Some(bbox) = self.bounding_box() {
last_field_id = bbox.write_thrift_field(writer, 1, last_field_id)?;
}
if let Some(geo_types) = self.geospatial_types() {
Expand Down
6 changes: 3 additions & 3 deletions parquet/src/geospatial/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ impl GeospatialStatistics {
}
}

/// Return the optional `BoundingBox`.
pub fn bbox(&self) -> Option<&BoundingBox> {
/// Optional bounding box defining the spatial extent, where `None` represents a lack of information.
pub fn bounding_box(&self) -> Option<&BoundingBox> {
self.bbox.as_ref()
}

/// Return the optional list of geospatial types.
/// Optional list of geometry type identifiers, where `None` represents a lack of information.
pub fn geospatial_types(&self) -> Option<&Vec<i32>> {
self.geospatial_types.as_ref()
}
Expand Down
73 changes: 70 additions & 3 deletions parquet/src/schema/printer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -329,9 +329,20 @@ fn print_logical_and_converted(
LogicalType::Variant {
specification_version,
} => format!("VARIANT({specification_version:?})"),
LogicalType::Geometry { crs } => format!("GEOMETRY({crs:?})"),
LogicalType::Geometry { crs } => {
if let Some(crs) = crs {
format!("GEOMETRY({crs})")
} else {
"GEOMETRY".to_string()
}
}
LogicalType::Geography { crs, algorithm } => {
format!("GEOGRAPHY({crs:?},{algorithm:?})")
let algorithm = algorithm.unwrap_or_default();
if let Some(crs) = crs {
format!("GEOGRAPHY({algorithm}, {crs})")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just pointing out that the order of formatting was switched from crs-algorithm to algorithm-crs. I think this is a beneficial change because the CRS could be a long string.

} else {
format!("GEOGRAPHY({algorithm})")
}
}
LogicalType::Unknown => "UNKNOWN".to_string(),
LogicalType::_Unknown { field_id } => format!("_Unknown({field_id})"),
Expand Down Expand Up @@ -454,7 +465,7 @@ mod tests {

use std::sync::Arc;

use crate::basic::{Repetition, Type as PhysicalType};
use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType};
use crate::errors::Result;
use crate::schema::parser::parse_message_type;

Expand Down Expand Up @@ -784,6 +795,62 @@ mod tests {
.unwrap(),
"REQUIRED BYTE_ARRAY field [42] (STRING);",
),
(
build_primitive_type(
"field",
None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Geometry { crs: None }),
ConvertedType::NONE,
Repetition::REQUIRED,
)
.unwrap(),
"REQUIRED BYTE_ARRAY field (GEOMETRY);",
),
(
build_primitive_type(
"field",
None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Geometry {
crs: Some("non-missing CRS".to_string()),
}),
ConvertedType::NONE,
Repetition::REQUIRED,
)
.unwrap(),
"REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));",
),
(
build_primitive_type(
"field",
None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Geography {
crs: None,
algorithm: Some(EdgeInterpolationAlgorithm::default()),
}),
ConvertedType::NONE,
Repetition::REQUIRED,
)
.unwrap(),
"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));",
),
(
build_primitive_type(
"field",
None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Geography {
crs: Some("non-missing CRS".to_string()),
algorithm: Some(EdgeInterpolationAlgorithm::default()),
}),
ConvertedType::NONE,
Repetition::REQUIRED,
)
.unwrap(),
"REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));",
),
];

types_and_strings.into_iter().for_each(|(field, expected)| {
Expand Down
123 changes: 123 additions & 0 deletions parquet/tests/geospatial.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Tests for Geometry and Geography logical types
use parquet::{
basic::{EdgeInterpolationAlgorithm, LogicalType},
file::{
metadata::ParquetMetaData,
reader::{FileReader, SerializedFileReader},
},
geospatial::bounding_box::BoundingBox,
};
use serde_json::Value;
use std::fs::File;

/// Opens `geospatial_test_file` from the `geospatial` sub-directory of the parquet test
/// data directory and returns a copy of its parsed metadata.
///
/// # Panics
///
/// Panics if the file cannot be opened or a `SerializedFileReader` cannot be built from it.
fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData {
    let test_data_dir = arrow::util::test_util::parquet_test_data();
    let full_path = format!("{test_data_dir}/geospatial/{geospatial_test_file}");
    let reader = SerializedFileReader::try_from(File::open(full_path).unwrap()).unwrap();
    // The reader is dropped on return, so the caller receives its own clone.
    reader.metadata().clone()
}

/// Checks that the logical type of column 1 in each geospatial test file is read back as
/// the expected [`LogicalType`], including a CRS that carries a JSON document.
#[test]
fn test_read_logical_type() {
    // Reads the logical type attached to column 1 of the named test file.
    let column_logical_type = |file_name: &str| {
        read_metadata(file_name)
            .file_metadata()
            .schema_descr()
            .column(1)
            .logical_type()
            .unwrap()
    };

    // Files whose crs is either absent or a short string that can be compared directly.
    let cases = [
        ("crs-default.parquet", LogicalType::Geometry { crs: None }),
        (
            "crs-srid.parquet",
            LogicalType::Geometry {
                crs: Some("srid:5070".to_string()),
            },
        ),
        (
            "crs-projjson.parquet",
            LogicalType::Geometry {
                crs: Some("projjson:projjson_epsg_5070".to_string()),
            },
        ),
        (
            "crs-geography.parquet",
            LogicalType::Geography {
                crs: None,
                algorithm: Some(EdgeInterpolationAlgorithm::SPHERICAL),
            },
        ),
    ];

    for (file_name, expected) in cases {
        assert_eq!(column_logical_type(file_name), expected);
    }

    // The crs may also hold an arbitrary value. Here it is a JSON document too long to
    // spell out, so it is parsed and only its identifying code is checked.
    match column_logical_type("crs-arbitrary-value.parquet") {
        LogicalType::Geometry { crs } => {
            let crs_json: Value = serde_json::from_str(&crs.unwrap()).unwrap();
            let code = crs_json.get("id").unwrap().get("code").unwrap();
            assert_eq!(code, 5070);
        }
        logical_type => panic!("Expected geometry type but got {logical_type:?}"),
    }
}

/// Checks the logical type of the third schema field of `geospatial.parquet` and the
/// geospatial statistics stored for column 2 of its first row group.
#[test]
fn test_read_geospatial_statistics() {
    let metadata = read_metadata("geospatial.parquet");

    // geospatial.parquet schema:
    //   optional binary field_id=-1 group (String);
    //   optional binary field_id=-1 wkt (String);
    //   optional binary field_id=-1 geometry (Geometry(crs=));
    let geometry_field = &metadata.file_metadata().schema().get_fields()[2];
    assert_eq!(
        geometry_field.get_basic_info().logical_type().unwrap(),
        LogicalType::Geometry { crs: None }
    );

    let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
    assert!(geo_statistics.is_some());
    let stats = geo_statistics.unwrap();

    // Expected type codes: 1..=7, then the same codes offset by 1000, 2000 and 3000.
    let expected_geospatial_types: Vec<i32> = (0..4)
        .flat_map(|offset| (1..=7).map(move |code| offset * 1000 + code))
        .collect();
    assert_eq!(stats.geospatial_types(), Some(&expected_geospatial_types));

    let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0)
        .with_zrange(30.0, 80.0)
        .with_mrange(200.0, 1600.0);
    assert_eq!(stats.bounding_box(), Some(&expected_bbox));
}
Loading