Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
916ec46
first go
paleolimbot Nov 7, 2025
5ccba21
python
paleolimbot Nov 7, 2025
bb465e8
string together more parts
paleolimbot Nov 8, 2025
c940e1a
fix for the example
paleolimbot Nov 8, 2025
86bd2fa
rat
paleolimbot Nov 8, 2025
eaa09c1
first test
paleolimbot Nov 11, 2025
9058561
towards projection
paleolimbot Nov 12, 2025
deed0a9
projecting?
paleolimbot Nov 13, 2025
a8106a0
format
paleolimbot Nov 13, 2025
710d3d7
test multi file
paleolimbot Nov 14, 2025
e775633
select a bunch of files
paleolimbot Nov 14, 2025
5d1102d
pass batch size through
paleolimbot Nov 14, 2025
ea97d28
start on the filter interface
paleolimbot Nov 14, 2025
ec5d638
start on filter bbox
paleolimbot Nov 15, 2025
2ae7c01
implement interval intersection
paleolimbot Nov 16, 2025
8673673
bounding box intersection
paleolimbot Nov 16, 2025
239bbe8
implement the intersection
paleolimbot Nov 16, 2025
7e31546
oogh
paleolimbot Nov 16, 2025
824acd7
vsizip, identify error with test
paleolimbot Nov 17, 2025
32b5b9a
fix error for bad file
paleolimbot Nov 17, 2025
41d4dc0
document Python
paleolimbot Nov 17, 2025
64766b5
document Python datasource
paleolimbot Nov 17, 2025
87ad44c
document py datasource
paleolimbot Nov 18, 2025
83b3b08
test object
paleolimbot Nov 18, 2025
8d81a6d
add test
paleolimbot Nov 19, 2025
524f6c8
test local filesystem
paleolimbot Nov 19, 2025
484a3d1
test projected reader
paleolimbot Nov 19, 2025
033bfc8
test spatial filter
paleolimbot Nov 19, 2025
173c3d3
document utility
paleolimbot Nov 19, 2025
96fb28b
test wraparound intersection
paleolimbot Nov 19, 2025
4bab9ce
test bounding box intersection
paleolimbot Nov 19, 2025
7666278
fix python tests
paleolimbot Nov 19, 2025
6568df3
Update python/sedonadb/python/sedonadb/datasource.py
paleolimbot Nov 19, 2025
76e88ca
Update rust/sedona-geometry/src/interval.rs
paleolimbot Nov 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions python/sedonadb/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ pyo3 = { version = "0.25.1" }
sedona = { path = "../../rust/sedona" }
sedona-adbc = { path = "../../rust/sedona-adbc" }
sedona-expr = { path = "../../rust/sedona-expr" }
sedona-datasource = { path = "../../rust/sedona-datasource" }
sedona-geometry = { path = "../../rust/sedona-geometry" }
sedona-geoparquet = { path = "../../rust/sedona-geoparquet" }
sedona-schema = { path = "../../rust/sedona-schema" }
sedona-proj = { path = "../../c/sedona-proj", default-features = false }
Expand Down
68 changes: 68 additions & 0 deletions python/sedonadb/python/sedonadb/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,74 @@ def read_parquet(
self.options,
)

def read_pyogrio(
self,
table_paths: Union[str, Path, Iterable[str]],
options: Optional[Dict[str, Any]] = None,
extension: str = "",
) -> DataFrame:
"""Read spatial file formats using GDAL/OGR via pyogrio

Creates a DataFrame from one or more paths or URLs to a file supported by
[pyogrio](https://pyogrio.readthedocs.io/en/latest/), which is the same package
that powers `geopandas.read_file()` by default. Some common formats that can be
opened using GDAL/OGR are FlatGeoBuf, GeoPackage, Shapefile, GeoJSON, and many,
many more. See <https://gdal.org/en/stable/drivers/vector/index.html> for a list
of available vector drivers.

Like `read_parquet()`, globs and directories can be specified in addition to
individual file paths. Paths ending in `.zip` are automatically prepended with
`/vsizip/` (i.e., are automatically unzipped by GDAL). HTTP(s) URLs are
supported via `/vsicurl/`.

Args:
table_paths: A str, Path, or iterable of paths containing URLs or
paths. Globs (i.e., `path/*.gpkg`), directories, and zipped
versions of otherwise readable files are supported.
options: An optional mapping of key/value pairs (open options)
passed to GDAL/OGR.
extension: An optional file extension (e.g., `"fgb"`) used when
`table_paths` specifies one or more directories or a glob
that does not enforce a file extension.

Examples:

>>> import geopandas
>>> import tempfile
>>> sd = sedona.db.connect()
>>> df = geopandas.GeoDataFrame({
... "geometry": geopandas.GeoSeries.from_wkt(["POINT (0 1)"], crs=3857)
... })
>>>
>>> with tempfile.TemporaryDirectory() as td:
... df.to_file(f"{td}/df.fgb")
... sd.read_pyogrio(f"{td}/df.fgb").show()
...
┌──────────────┐
│ wkb_geometry │
│ geometry │
╞══════════════╡
│ POINT(0 1) │
└──────────────┘

"""
from sedonadb.datasource import PyogrioFormatSpec

if isinstance(table_paths, (str, Path)):
table_paths = [table_paths]

spec = PyogrioFormatSpec(extension)
if options is not None:
spec = spec.with_options(options)

return DataFrame(
self._impl,
self._impl.read_external_format(
spec, [str(path) for path in table_paths], False
),
self.options,
)

def sql(self, sql: str) -> DataFrame:
"""Create a [DataFrame][sedonadb.dataframe.DataFrame] by executing SQL

Expand Down
194 changes: 194 additions & 0 deletions python/sedonadb/python/sedonadb/datasource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, Mapping

from sedonadb._lib import PyExternalFormat, PyProjectedRecordBatchReader


class ExternalFormatSpec:
"""Python file format specification

This class defines an abstract "file format", which maps to the DataFusion
concept of a `FileFormat`. This is a layer on top of the `TableProvider` that
provides standard support for querying collections of files using globs
or directories of files with compatible schemas. This abstraction allows for
basic support for pruning and partial filter pushdown (e.g., a bounding box
is available if one was provided in the underlying query); however, data
providers with more advanced features may wish to implement a `TableProvider`
in Rust to take advantage of a wider range of DataFusion features.

Implementations are only required to implement `open_reader()`; however, if
opening a reader is expensive and there is a more efficient way to infer a
schema from a given source, implementers may wish to also implement
`infer_schema()`.

This extension point is experimental and may evolve to serve the needs of
various file formats.
"""

@property
def extension(self):
"""A file extension for files that match this format

If this concept is not important for this format, returns an empty string.
"""
return ""

def with_options(self, options: Mapping[str, Any]):
"""Clone this instance and return a new instance with options applied

Apply an arbitrary set of format-defined key/value options. It is useful
to raise an error in this method if an option or value will later result
in an error; however, implementation may defer the error until later if
required by the underlying producer.

The default implementation of this method errors for any attempt to
pass options.
"""
raise NotImplementedError(
f"key/value options not supported by {type(self).__name__}"
)

def open_reader(self, args: Any):
"""Open an ArrowArrayStream/RecordBatchReader of batches given input information

Note that the output stream must take into account `args.file_projection`, if one
exists (`PyProjectedRecordBatchReader` may be used to ensure a set of output
columns or apply an output projection on an input stream.

The internals will keep a strong (Python) reference to the returned object
for as long as batches are being produced.

Args:
args: An object with attributes
- `src`: An object/file abstraction. Currently, `.to_url()` is the best way
to extract the underlying URL from the source.
- `filter`: An object representing the filter expression that was pushed
down, if one exists. Currently, `.bounding_box(column_index)` is the only
way to interact with this object.
- `file_schema`: An optional schema. If `None`, the implementation must
infer the schema.
- `file_projection`: An optional list of integers of the columns of
`file_schema` that must be produced by this implementation (in the
exact order specified).
- `batch_size`: An optional integer specifying the number of rows requested
for each output batch.

"""
raise NotImplementedError()

def infer_schema(self, src):
"""Infer the output schema

Implementations can leave this unimplemented, in which case the internals will call
`open_reader()` and query the provided schema without pulling any batches.

Args:
src: An object/file abstraction. Currently, `.to_url()` is the best way
to extract the underlying URL from the source.
"""
raise NotImplementedError()

def __sedona_external_format__(self):
return PyExternalFormat(self)


class PyogrioFormatSpec(ExternalFormatSpec):
"""An `ExternalFormatSpec` implementation wrapping GDAL/OGR via pyogrio"""

def __init__(self, extension=""):
self._extension = extension
self._options = {}

def with_options(self, options):
cloned = type(self)(self.extension)
cloned._options.update(options)
return cloned

@property
def extension(self) -> str:
return self._extension

def open_reader(self, args):
import pyogrio.raw

url = args.src.to_url()
if url is None:
raise ValueError(f"Can't convert {args.src} to OGR-openable object")

if url.startswith("http://") or url.startswith("https://"):
ogr_src = f"/vsicurl/{url}"
elif url.startswith("file://"):
ogr_src = url.removeprefix("file://")
else:
raise ValueError(f"Can't open {url} with OGR")

if ogr_src.endswith(".zip"):
ogr_src = f"/vsizip/{ogr_src}"

if args.is_projected():
file_names = args.file_schema.names
columns = [file_names[i] for i in args.file_projection]
else:
columns = None

batch_size = args.batch_size if args.batch_size is not None else 0

if args.filter and args.file_schema is not None:
geometry_column_indices = args.file_schema.geometry_column_indices
if len(geometry_column_indices) == 1:
bbox = args.filter.bounding_box(geometry_column_indices[0])
else:
bbox = None
else:
bbox = None

return PyogrioReaderShelter(
pyogrio.raw.ogr_open_arrow(
ogr_src, {}, columns=columns, batch_size=batch_size, bbox=bbox
),
columns,
)


class PyogrioReaderShelter:
"""Python object wrapper around the context manager returned by pyogrio

The pyogrio object returned by `pyogrio.raw.ogr_open_arrow()` is a context
manager; however, the internals can only manage Rust object references.
This object ensures that the context manager is closed when the object
is deleted (which occurs as soon as possible when the returned reader
is no longer required).
"""

def __init__(self, inner, output_names=None):
self._inner = inner
self._output_names = output_names
self._meta, self._reader = self._inner.__enter__()

def __del__(self):
self._inner.__exit__(None, None, None)

def __arrow_c_stream__(self, requested_schema=None):
if self._output_names is None:
return self._reader.__arrow_c_stream__()
else:
projected = PyProjectedRecordBatchReader(
self._reader, None, self._output_names
)
return projected.__arrow_c_stream__()
21 changes: 21 additions & 0 deletions python/sedonadb/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use tokio::runtime::Runtime;

use crate::{
dataframe::InternalDataFrame,
datasource::PyExternalFormat,
error::PySedonaError,
import_from::{import_ffi_scalar_udf, import_table_provider_from_any},
runtime::wait_for_future,
Expand Down Expand Up @@ -107,6 +108,26 @@ impl InternalContext {
Ok(InternalDataFrame::new(df, self.runtime.clone()))
}

pub fn read_external_format<'py>(
&self,
py: Python<'py>,
format_spec: Bound<PyAny>,
table_paths: Vec<String>,
check_extension: bool,
) -> Result<InternalDataFrame, PySedonaError> {
let spec = format_spec
.call_method0("__sedona_external_format__")?
.extract::<PyExternalFormat>()?;
let df = wait_for_future(
py,
&self.runtime,
self.inner
.read_external_format(Arc::new(spec), table_paths, None, check_extension),
)??;

Ok(InternalDataFrame::new(df, self.runtime.clone()))
}

pub fn sql<'py>(
&self,
py: Python<'py>,
Expand Down
Loading
Loading