45 changes: 45 additions & 0 deletions rust/sedona-geoparquet/src/file_opener.rs
@@ -24,6 +24,7 @@ use datafusion::datasource::{
};
use datafusion_common::Result;
use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder};
use object_store::ObjectStore;
use parquet::file::{
metadata::{ParquetMetaData, RowGroupMetaData},
@@ -35,6 +36,37 @@ use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};

use crate::metadata::GeoParquetMetadata;

#[derive(Clone)]
struct GeoParquetFileOpenerMetrics {
/// How many file ranges are pruned by [`SpatialFilter`]
///
/// Note on "file range": an opener may read only part of a file rather than the
/// entire file; that portion is referred to as the "file range". See [`PartitionedFile`]
/// for details.
files_ranges_spatial_pruned: Count,
/// How many file ranges are matched by [`SpatialFilter`]
files_ranges_spatial_matched: Count,
/// How many row groups are pruned by [`SpatialFilter`]
row_groups_spatial_pruned: Count,
/// How many row groups are matched by [`SpatialFilter`]
row_groups_spatial_matched: Count,
}

impl GeoParquetFileOpenerMetrics {
fn new(execution_plan_global_metrics: &ExecutionPlanMetricsSet) -> Self {
Self {
files_ranges_spatial_pruned: MetricBuilder::new(execution_plan_global_metrics)
.global_counter("files_ranges_spatial_pruned"),
files_ranges_spatial_matched: MetricBuilder::new(execution_plan_global_metrics)
.global_counter("files_ranges_spatial_matched"),
row_groups_spatial_pruned: MetricBuilder::new(execution_plan_global_metrics)
.global_counter("row_groups_spatial_pruned"),
row_groups_spatial_matched: MetricBuilder::new(execution_plan_global_metrics)
.global_counter("row_groups_spatial_matched"),
}
}
}
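
Note: the following is a minimal standalone sketch (not part of this diff) of how these global counters behave; it assumes only the datafusion_physical_plan crate and reuses one of the counter names added above. A global counter is not bound to a partition, and every clone shares the same underlying value, so increments from all opener clones aggregate into the single number that EXPLAIN ANALYZE renders.

use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder};

fn main() {
    let metrics = ExecutionPlanMetricsSet::new();
    // Register a global (partition-less) counter under the same name the PR uses.
    let pruned: Count =
        MetricBuilder::new(&metrics).global_counter("files_ranges_spatial_pruned");
    // Clones share the underlying value, mirroring the cloned opener in the diff.
    let pruned_clone = pruned.clone();
    pruned.add(1);
    pruned_clone.add(1);
    assert_eq!(pruned.value(), 2);
    // EXPLAIN ANALYZE displays the aggregated set, e.g. "files_ranges_spatial_pruned=2".
    println!("{}", metrics.clone_inner().aggregate_by_name());
}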

/// Geo-aware [FileOpener] implementing file and row group pruning
///
/// Pruning happens (for Parquet) in the [FileOpener], so we implement
@@ -47,6 +79,7 @@ pub struct GeoParquetFileOpener {
predicate: Arc<dyn PhysicalExpr>,
file_schema: SchemaRef,
enable_pruning: bool,
metrics: GeoParquetFileOpenerMetrics,
}

impl GeoParquetFileOpener {
@@ -58,6 +91,7 @@ impl GeoParquetFileOpener {
predicate: Arc<dyn PhysicalExpr>,
file_schema: SchemaRef,
enable_pruning: bool,
execution_plan_global_metrics: &ExecutionPlanMetricsSet,
) -> Self {
Self {
inner,
@@ -66,6 +100,7 @@
predicate,
file_schema,
enable_pruning,
metrics: GeoParquetFileOpenerMetrics::new(execution_plan_global_metrics),
}
}
}
@@ -96,6 +131,7 @@ impl FileOpener for GeoParquetFileOpener {
&mut access_plan,
&spatial_filter,
&geoparquet_metadata,
&self_clone.metrics,
)?;

filter_access_plan_using_geoparquet_covering(
@@ -104,6 +140,7 @@
&spatial_filter,
&geoparquet_metadata,
&parquet_metadata,
&self_clone.metrics,
)?;
}
}
@@ -135,12 +172,16 @@ fn filter_access_plan_using_geoparquet_file_metadata(
access_plan: &mut ParquetAccessPlan,
spatial_filter: &SpatialFilter,
metadata: &GeoParquetMetadata,
metrics: &GeoParquetFileOpenerMetrics,
) -> Result<()> {
let table_geo_stats = geoparquet_file_geo_stats(file_schema, metadata)?;
if !spatial_filter.evaluate(&table_geo_stats) {
metrics.files_ranges_spatial_pruned.add(1);
for i in access_plan.row_group_indexes() {
access_plan.skip(i);
}
} else {
metrics.files_ranges_spatial_matched.add(1);
}

Ok(())
@@ -156,6 +197,7 @@ fn filter_access_plan_using_geoparquet_covering(
spatial_filter: &SpatialFilter,
metadata: &GeoParquetMetadata,
parquet_metadata: &ParquetMetaData,
metrics: &GeoParquetFileOpenerMetrics,
) -> Result<()> {
let row_group_indices_to_scan = access_plan.row_group_indexes();

@@ -176,7 +218,10 @@

// Evaluate predicate!
if !spatial_filter.evaluate(&row_group_geo_stats) {
metrics.row_groups_spatial_pruned.add(1);
access_plan.skip(i);
} else {
metrics.row_groups_spatial_matched.add(1);
}
}
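
Note: a self-contained sketch of the counting pattern used by both filter functions above; Counters, keep, and stats_match are hypothetical stand-ins for ParquetAccessPlan, SpatialFilter, and the geo statistics. Exactly one counter is incremented per row-group decision, and groups already skipped by an earlier pass are never re-counted.

struct Counters {
    pruned: usize,
    matched: usize,
}

// `keep[i]` plays the role of the access plan: true means row group i is still scanned.
fn filter_row_groups(keep: &mut [bool], stats_match: impl Fn(usize) -> bool, c: &mut Counters) {
    for i in 0..keep.len() {
        if !keep[i] {
            continue; // already skipped by an earlier pruning pass
        }
        if stats_match(i) {
            c.matched += 1; // row group survives the spatial filter
        } else {
            c.pruned += 1; // counted once, then skipped (like access_plan.skip(i))
            keep[i] = false;
        }
    }
}

fn main() {
    let mut keep = vec![true, true, false];
    let mut c = Counters { pruned: 0, matched: 0 };
    filter_row_groups(&mut keep, |i| i == 0, &mut c);
    assert_eq!((c.pruned, c.matched), (1, 1)); // group 2 was never considered
}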

17 changes: 13 additions & 4 deletions rust/sedona-geoparquet/src/format.rs
@@ -364,15 +364,21 @@ impl GeoParquetFileSource {
predicate: Option<Arc<dyn PhysicalExpr>>,
) -> Result<Self> {
if let Some(parquet_source) = inner.as_any().downcast_ref::<ParquetSource>() {
let mut parquet_source = parquet_source.clone();
let parquet_source = parquet_source.clone();
// Extract the predicate from the existing source if it exists so we can keep a copy of it
let new_predicate = match (parquet_source.predicate().cloned(), predicate) {
(None, None) => None,
(None, Some(specified_predicate)) => Some(specified_predicate),
(Some(inner_predicate), None) => Some(inner_predicate),
(Some(_), Some(specified_predicate)) => {
parquet_source = parquet_source.with_predicate(specified_predicate.clone());
@2010YOUY01 (Contributor, Author) commented on Oct 2, 2025:

DataFusion's with_predicate() API will reset metrics unexpectedly; see apache/datafusion#17858. I checked the related implementation: the predicate inside the inner ParquetSource and the predicate in GeoParquetFileSource should always be the same, so I made the implementation more defensive here to avoid the DataFusion bug that clears the metrics.
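
Note: a minimal sketch of the defensive identity check this comment describes; pick_predicate and its String error are hypothetical stand-ins for the real types. Arc::ptr_eq compares allocations rather than values, so when both sides hold the same predicate no rebuild happens, and the metrics-resetting with_predicate() path is never taken.

use std::sync::Arc;

fn pick_predicate<T>(inner: Arc<T>, specified: Option<Arc<T>>) -> Result<Arc<T>, String> {
    match specified {
        // Nothing specified: keep the inner source's predicate untouched.
        None => Ok(inner),
        // Same allocation: no rebuild needed, so the metrics survive.
        Some(s) if Arc::ptr_eq(&inner, &s) => Ok(inner),
        // Diverging predicates indicate a bug upstream; fail loudly.
        Some(_) => Err("inner predicate should match the specified predicate".to_string()),
    }
}

fn main() {
    let p = Arc::new(42);
    assert!(pick_predicate(p.clone(), Some(p.clone())).is_ok());
    let q = Arc::new(42); // equal value, different allocation
    assert!(pick_predicate(p, Some(q)).is_err());
}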

Some(specified_predicate)
(Some(inner_predicate), Some(specified_predicate)) => {
// Sanity check: the predicate in `GeoParquetFileSource` is initialized
// from its inner ParquetSource's predicate, so the two should be
// equivalent.
if Arc::ptr_eq(&inner_predicate, &specified_predicate) {
Some(inner_predicate)
} else {
return sedona_internal_err!("Inner predicate should be equivalent to the predicate in `GeoParquetFileSource`");
}
}
};

@@ -452,6 +458,9 @@ impl FileSource for GeoParquetFileSource {
self.predicate.clone().unwrap(),
base_config.file_schema.clone(),
self.inner.table_parquet_options().global.pruning,
// HACK: since there is no public API to set the inner source's metrics, we use
// the inner source's metrics as the ExecutionPlan-global metrics
self.inner.metrics(),
))
}

97 changes: 97 additions & 0 deletions rust/sedona/tests/metrics.rs
@@ -0,0 +1,97 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use datafusion::arrow::util::pretty::pretty_format_batches;
use sedona::context::SedonaContext;

#[tokio::test]
async fn geo_parquet_metrics() {
Member commented:

Should the Parquet-specific test be in rust/sedona-geoparquet/tests/ instead of rust/sedona/tests/ for better organization?

Member commented:

It's true that all of our other pruning tests are in sedona-geoparquet (for better or worse! The top-level sedona crate didn't exist when I wrote them...). We don't have access to a real ST_Intersects() there, but we do have a fake one to test pruning:

#[rstest]
#[tokio::test]
async fn pruning_geoparquet_metadata(#[values("st_intersects", "st_contains")] udf_name: &str) {

I'd prefer to keep the pruning tests together in sedona-geoparquet but also happy to have some integration-y tests here if there's some technical reason they can't live there 🙂

@2010YOUY01 (Contributor, Author) commented on Oct 4, 2025:

For technical reasons, these tests are easier to implement as e2e/integration tests. If we wanted them in sedona-geoparquet for better organization, they would have to test some very low-level utility functions, and the issue is that those tests are volatile -- a simple refactor would require rewriting them, while integration tests are more stable.

// Setup and register test table
// -----------------------------
let ctx = SedonaContext::new_local_interactive()
.await
.expect("interactive context should initialize");

let geo_parquet_path = "../../submodules/sedona-testing/data/parquet/geoparquet-1.1.0.parquet";
Member commented:
We have a helper for this one (mostly to give an actionable error if somebody forgot to initialize the submodules):

/// Find the most likely path to the geoarrow-data testing directory if it exists
///
/// This looks for a geoarrow-data checkout using the value of SEDONA_GEOARROW_DATA_DIR,
/// the directory that would be valid if running cargo run from the repository root,
/// or the directory that would be valid if running cargo test (in that order).
pub fn geoarrow_data_dir() -> Result<String> {

Member replied:
Ah right, that's for geoarrow-data and not sedona-testing. No need to add a second helper, this is great as is!

Contributor Author replied:
Good idea! Done in cac92f8

let create_table_sql =
format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{geo_parquet_path}'");

ctx.sql(&create_table_sql)
.await
.expect("create table should succeed")
.collect()
.await
.expect("collecting create table result should succeed");

// Test 1: query with a spatial predicate that prunes the entire file
// ----------------------------------------------------------------
let prune_query = r#"
EXPLAIN ANALYZE
SELECT *
FROM test
WHERE ST_Intersects(
geometry,
ST_SetSRID(
ST_GeomFromText('POLYGON((-10 84, -10 88, 10 88, 10 84, -10 84))'),
4326
)
)
"#;

let prune_plan = run_and_format(&ctx, prune_query).await;
assert!(prune_plan.contains("files_ranges_spatial_pruned=1"));
assert!(prune_plan.contains("files_ranges_spatial_matched=0"));
assert!(prune_plan.contains("row_groups_spatial_pruned=0"));
assert!(prune_plan.contains("row_groups_spatial_matched=0"));

// Test 2: query with spatial filter that can't skip any file or row group
// -----------------------------------------------------------------------
let match_query = r#"
EXPLAIN ANALYZE
SELECT *
FROM test
WHERE ST_Intersects(
geometry,
ST_SetSRID(
ST_GeomFromText(
'POLYGON((-180 -18.28799, -180 83.23324, 180 83.23324, 180 -18.28799, -180 -18.28799))'
),
4326
)
)
"#;

let match_plan = run_and_format(&ctx, match_query).await;
assert!(match_plan.contains("files_ranges_spatial_pruned=0"));
assert!(match_plan.contains("files_ranges_spatial_matched=1"));
assert!(match_plan.contains("row_groups_spatial_pruned=0"));
assert!(match_plan.contains("row_groups_spatial_matched=1"));
}

async fn run_and_format(ctx: &SedonaContext, sql: &str) -> String {
let df = ctx
.sql(sql.trim())
.await
.expect("explain analyze query should succeed");
let batches = df
.collect()
.await
.expect("collecting explain analyze result should succeed");
format!(
"{}",
pretty_format_batches(&batches).expect("formatting plan should succeed")
)
}