Skip to content

Commit f49016d

Browse files
Add an efficient GeometryTypeAndDimensionsSet to improve the performance of geo statistics collection and ST_Analyze_Aggr (#285)
The idea is pretty simple: we use a 32-bit bitset to represent a set of geometry type and dimension combinations. This replaces usages of `HashSet<GeometryTypeAndDimensions>` in `GeoStatistics` and reduces the overhead of updating geo statistics in `AnalyzeAccumulator`.

We care about the performance of the geo statistics analyzer because it is applied to all geometries on the indexed side when running a spatial join. `AnalyzeAccumulator` and ST_Analyze_Aggr can be useful in some other places as well, so we'd like them to have minimal performance overhead.

Here is the benchmark result of st_analyze_aggr after applying this patch:

```
Gnuplot not found, using plotters backend
native-st_analyze_aggr-Array(Point)
    time:   [4.1267 ms 4.2026 ms 4.3423 ms]
    change: [-87.458% -87.216% -86.808%] (p = 0.00 < 0.05)
    Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  6 (6.00%) high mild
  2 (2.00%) high severe
native-st_analyze_aggr-Array(LineString(10))
    time:   [5.6607 ms 5.6728 ms 5.6868 ms]
    change: [-83.578% -83.529% -83.482%] (p = 0.00 < 0.05)
    Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe
```

Co-authored-by: Copilot <[email protected]>
1 parent 3e6cfe3 commit f49016d

File tree

4 files changed

+511
-74
lines changed

4 files changed

+511
-74
lines changed

rust/sedona-expr/src/statistics.rs

Lines changed: 54 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@
1414
// KIND, either express or implied. See the License for the
1515
// specific language governing permissions and limitations
1616
// under the License.
17-
use std::{collections::HashSet, str::FromStr};
17+
use std::str::FromStr;
1818

1919
use datafusion_common::{stats::Precision, ColumnStatistics, DataFusionError, Result, ScalarValue};
2020
use sedona_geometry::interval::{Interval, IntervalTrait};
21-
use sedona_geometry::{bounding_box::BoundingBox, types::GeometryTypeAndDimensions};
21+
use sedona_geometry::{
22+
bounding_box::BoundingBox,
23+
types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet},
24+
};
2225
use serde::{Deserialize, Serialize};
2326

2427
/// Statistics specific to spatial data types
@@ -33,7 +36,7 @@ use serde::{Deserialize, Serialize};
3336
pub struct GeoStatistics {
3437
// Core spatial statistics for pruning
3538
bbox: Option<BoundingBox>, // The overall bounding box (min/max coordinates) containing all geometries
36-
geometry_types: Option<HashSet<GeometryTypeAndDimensions>>, // Set of all geometry types and dimensions present
39+
geometry_types: Option<GeometryTypeAndDimensionsSet>, // Set of all geometry types and dimensions present
3740

3841
// Extended statistics for analysis
3942
total_geometries: Option<i64>, // Total count of all geometries
@@ -73,16 +76,16 @@ impl GeoStatistics {
7376
pub fn empty() -> Self {
7477
Self {
7578
bbox: Some(BoundingBox::xy(Interval::empty(), Interval::empty())),
76-
geometry_types: Some(HashSet::new()), // Empty set of geometry types
77-
total_geometries: Some(0), // Zero geometries
78-
total_size_bytes: Some(0), // Zero bytes
79-
total_points: Some(0), // Zero points
80-
puntal_count: Some(0), // Zero point geometries
81-
lineal_count: Some(0), // Zero line geometries
82-
polygonal_count: Some(0), // Zero polygon geometries
83-
collection_count: Some(0), // Zero collection geometries
84-
total_envelope_width: Some(0.0), // Zero width
85-
total_envelope_height: Some(0.0), // Zero height
79+
geometry_types: Some(GeometryTypeAndDimensionsSet::new()), // Empty set of geometry types
80+
total_geometries: Some(0), // Zero geometries
81+
total_size_bytes: Some(0), // Zero bytes
82+
total_points: Some(0), // Zero points
83+
puntal_count: Some(0), // Zero point geometries
84+
lineal_count: Some(0), // Zero line geometries
85+
polygonal_count: Some(0), // Zero polygon geometries
86+
collection_count: Some(0), // Zero collection geometries
87+
total_envelope_width: Some(0.0), // Zero width
88+
total_envelope_height: Some(0.0), // Zero height
8689
}
8790
}
8891

@@ -92,20 +95,10 @@ impl GeoStatistics {
9295
}
9396

9497
/// Update the geometry types and return self
95-
pub fn with_geometry_types(self, types: Option<&[GeometryTypeAndDimensions]>) -> Self {
96-
match types {
97-
Some(type_slice) => {
98-
let type_set: HashSet<GeometryTypeAndDimensions> =
99-
type_slice.iter().cloned().collect();
100-
Self {
101-
geometry_types: Some(type_set),
102-
..self
103-
}
104-
}
105-
None => Self {
106-
geometry_types: None,
107-
..self
108-
},
98+
pub fn with_geometry_types(self, types: Option<GeometryTypeAndDimensionsSet>) -> Self {
99+
Self {
100+
geometry_types: types,
101+
..self
109102
}
110103
}
111104

@@ -115,7 +108,7 @@ impl GeoStatistics {
115108
}
116109

117110
/// Get the geometry types if available
118-
pub fn geometry_types(&self) -> Option<&HashSet<GeometryTypeAndDimensions>> {
111+
pub fn geometry_types(&self) -> Option<&GeometryTypeAndDimensionsSet> {
119112
self.geometry_types.as_ref()
120113
}
121114

@@ -290,9 +283,7 @@ impl GeoStatistics {
290283
if let Some(other_types) = &other.geometry_types {
291284
match &mut self.geometry_types {
292285
Some(types) => {
293-
let mut new_types = types.clone();
294-
new_types.extend(other_types.iter().cloned());
295-
self.geometry_types = Some(new_types);
286+
types.merge(other_types);
296287
}
297288
None => self.geometry_types = Some(other_types.clone()),
298289
}
@@ -374,13 +365,12 @@ impl GeoStatistics {
374365
pub fn try_with_str_geometry_types(self, geometry_types: Option<&[&str]>) -> Result<Self> {
375366
match geometry_types {
376367
Some(strings) => {
377-
let new_geometry_types = strings
378-
.iter()
379-
.map(|string| {
380-
GeometryTypeAndDimensions::from_str(string)
381-
.map_err(|e| DataFusionError::External(Box::new(e)))
382-
})
383-
.collect::<Result<HashSet<GeometryTypeAndDimensions>>>()?;
368+
let mut new_geometry_types = GeometryTypeAndDimensionsSet::new();
369+
for string in strings {
370+
let type_and_dim = GeometryTypeAndDimensions::from_str(string)
371+
.map_err(|e| DataFusionError::External(Box::new(e)))?;
372+
new_geometry_types.insert_or_ignore(&type_and_dim);
373+
}
384374

385375
Ok(Self {
386376
geometry_types: Some(new_geometry_types),
@@ -442,7 +432,10 @@ mod test {
442432
// Test with_bbox
443433
let stats = GeoStatistics::empty().with_bbox(Some(bbox.clone()));
444434
assert_eq!(stats.bbox(), Some(&bbox));
445-
assert_eq!(stats.geometry_types(), Some(HashSet::new()).as_ref());
435+
assert_eq!(
436+
stats.geometry_types(),
437+
Some(&GeometryTypeAndDimensionsSet::new())
438+
);
446439

447440
let regular_stats = stats.to_column_statistics().unwrap();
448441
assert_eq!(
@@ -459,15 +452,17 @@ mod test {
459452

460453
#[test]
461454
fn specified_geometry_types() {
462-
let type_array = [GeometryTypeAndDimensions::new(
463-
GeometryTypeId::Polygon,
464-
Dimensions::Xy,
465-
)];
455+
let mut types = GeometryTypeAndDimensionsSet::new();
456+
types
457+
.insert(&GeometryTypeAndDimensions::new(
458+
GeometryTypeId::Polygon,
459+
Dimensions::Xy,
460+
))
461+
.unwrap();
466462

467463
// Test with_geometry_types
468-
let stats = GeoStatistics::empty().with_geometry_types(Some(&type_array));
469-
let expected_set: HashSet<GeometryTypeAndDimensions> = type_array.iter().cloned().collect();
470-
assert_eq!(stats.geometry_types(), Some(&expected_set));
464+
let stats = GeoStatistics::empty().with_geometry_types(Some(types.clone()));
465+
assert_eq!(stats.geometry_types(), Some(&types));
471466
assert_eq!(
472467
stats.bbox(),
473468
Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
@@ -493,15 +488,19 @@ mod test {
493488
.try_with_str_geometry_types(Some(&["polygon", "point"]))
494489
.unwrap();
495490

496-
let mut expected_types = HashSet::new();
497-
expected_types.insert(GeometryTypeAndDimensions::new(
498-
GeometryTypeId::Polygon,
499-
Dimensions::Xy,
500-
));
501-
expected_types.insert(GeometryTypeAndDimensions::new(
502-
GeometryTypeId::Point,
503-
Dimensions::Xy,
504-
));
491+
let mut expected_types = GeometryTypeAndDimensionsSet::new();
492+
expected_types
493+
.insert(&GeometryTypeAndDimensions::new(
494+
GeometryTypeId::Polygon,
495+
Dimensions::Xy,
496+
))
497+
.unwrap();
498+
expected_types
499+
.insert(&GeometryTypeAndDimensions::new(
500+
GeometryTypeId::Point,
501+
Dimensions::Xy,
502+
))
503+
.unwrap();
505504

506505
assert_eq!(stats.geometry_types(), Some(&expected_types));
507506
assert_eq!(

rust/sedona-functions/src/st_analyze_aggr.rs

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use sedona_expr::aggregate_udf::SedonaAggregateUDF;
3535
use sedona_expr::{aggregate_udf::SedonaAccumulator, statistics::GeoStatistics};
3636
use sedona_geometry::analyze::GeometryAnalysis;
3737
use sedona_geometry::interval::IntervalTrait;
38-
use sedona_geometry::types::GeometryTypeAndDimensions;
38+
use sedona_geometry::types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet};
3939
use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
4040
use wkb::reader::Wkb;
4141

@@ -353,18 +353,15 @@ impl AnalyzeAccumulator {
353353
let current_types = stats.geometry_types();
354354
let types = if let Some(existing_types) = current_types {
355355
let mut new_types = existing_types.clone();
356-
new_types.insert(geometry_type);
356+
new_types.insert_or_ignore(&geometry_type);
357357
Some(new_types)
358358
} else {
359-
Some(std::collections::HashSet::from([geometry_type]))
359+
let mut new_set = GeometryTypeAndDimensionsSet::new();
360+
new_set.insert_or_ignore(&geometry_type);
361+
Some(new_set)
360362
};
361363

362-
if let Some(type_set) = &types {
363-
let type_vec: Vec<GeometryTypeAndDimensions> = type_set.iter().cloned().collect();
364-
stats.with_geometry_types(Some(&type_vec))
365-
} else {
366-
stats.with_geometry_types(None)
367-
}
364+
stats.with_geometry_types(types)
368365
}
369366

370367
fn execute_update(&mut self, executor: WkbExecutor) -> Result<()> {
@@ -414,9 +411,10 @@ impl Accumulator for AnalyzeAccumulator {
414411
// Add approximate size for geometry types if present
415412
let types_size = match self.stats.geometry_types() {
416413
Some(types) => {
414+
// GeometryTypeAndDimensionsSet is a u32 bitset
417415
let elem_size = size_of::<GeometryTypeAndDimensions>();
418-
let capacity = types.capacity();
419-
capacity * elem_size
416+
let count = types.size();
417+
count * elem_size
420418
}
421419
None => 0,
422420
};

0 commit comments

Comments
 (0)