Skip to content

Commit 9c99a1f

Browse files
authored
Python: Use pyo3-object_store for reading data from remote object stores (#849)
1 parent 8bad0f6 commit 9c99a1f

File tree

15 files changed

+125
-767
lines changed

15 files changed

+125
-767
lines changed

python/Cargo.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/geoarrow-io/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ default = ["async"]
2222
async = [
2323
"dep:futures",
2424
"dep:object_store",
25+
"dep:pyo3-object_store",
2526
"parquet/object_store",
2627
"dep:pyo3-async-runtimes",
2728
"geoarrow/flatgeobuf_async",
@@ -48,6 +49,7 @@ pyo3-arrow = { workspace = true }
4849
pyo3-async-runtimes = { version = "0.22", features = [
4950
"tokio-runtime",
5051
], optional = true }
52+
pyo3-object_store = { version = "0.1.0-beta.1", optional = true }
5153
pythonize = "0.22"
5254
geo = "0.28"
5355
geo-traits = { workspace = true }

python/geoarrow-io/python/geoarrow/rust/io/_io.pyi

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ from __future__ import annotations
33
from pathlib import Path
44
from typing import (
55
BinaryIO,
6-
Dict,
76
List,
87
Optional,
98
Sequence,
@@ -24,15 +23,15 @@ from .enums import GeoParquetEncoding
2423
from .types import BboxCovering, GeoParquetEncodingT
2524

2625
class ParquetFile:
27-
def __init__(self, path: str, fs: ObjectStore) -> None:
26+
def __init__(self, path: str, store: ObjectStore) -> None:
2827
"""
2928
Construct a new ParquetFile
3029
3130
This will synchronously fetch metadata from the provided path
3231
3332
Args:
3433
path: a string URL to read from.
35-
fs: the file system interface to read from.
34+
store: the ObjectStore to read from.
3635
3736
Returns:
3837
A new ParquetFile object.
@@ -133,15 +132,15 @@ class ParquetFile:
133132
"""
134133

135134
class ParquetDataset:
136-
def __init__(self, paths: Sequence[str], fs: ObjectStore) -> None:
135+
def __init__(self, paths: Sequence[str], store: ObjectStore) -> None:
137136
"""
138137
Construct a new ParquetDataset
139138
140139
This will synchronously fetch metadata from all listed files.
141140
142141
Args:
143142
paths: a list of string URLs to read from.
144-
fs: the file system interface to read from.
143+
store: the ObjectStore to read from.
145144
146145
Returns:
147146
A new ParquetDataset object.
@@ -241,9 +240,6 @@ class ParquetWriter:
241240
table: _description_
242241
"""
243242

244-
class ObjectStore:
245-
def __init__(self, root: str, options: Optional[Dict[str, str]] = None) -> None: ...
246-
247243
def read_csv(
248244
file: str | Path | BinaryIO,
249245
geometry_column_name: str,
@@ -265,7 +261,7 @@ def read_csv(
265261
def read_flatgeobuf(
266262
file: Union[str, Path, BinaryIO],
267263
*,
268-
fs: Optional[ObjectStore] = None,
264+
store: Optional[ObjectStore] = None,
269265
batch_size: int = 65536,
270266
bbox: Tuple[float, float, float, float] | None = None,
271267
) -> Table:
@@ -309,15 +305,15 @@ def read_flatgeobuf(
309305
"aws_secret_access_key": "...",
310306
"aws_region": "..."
311307
}
312-
fs = ObjectStore('s3://bucket', options=options)
313-
table = read_flatgeobuf("path/in/bucket.fgb", fs=fs)
308+
store = ObjectStore('s3://bucket', options=options)
309+
table = read_flatgeobuf("path/in/bucket.fgb", store=store)
314310
```
315311
316312
Args:
317313
file: the path to the file or a Python file object in binary read mode.
318314
319315
Other args:
320-
fs: an ObjectStore instance for this url. This is required only if the file is at a remote
316+
store: an ObjectStore instance for this url. This is required only if the file is at a remote
321317
location.
322318
batch_size: the number of rows to include in each internal batch of the table.
323319
bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
@@ -330,7 +326,7 @@ def read_flatgeobuf(
330326
async def read_flatgeobuf_async(
331327
path: str,
332328
*,
333-
fs: Optional[ObjectStore] = None,
329+
store: Optional[ObjectStore] = None,
334330
batch_size: int = 65536,
335331
bbox: Tuple[float, float, float, float] | None = None,
336332
) -> Table:
@@ -358,17 +354,17 @@ async def read_flatgeobuf_async(
358354
"aws_secret_access_key": "...",
359355
"aws_region": "..."
360356
}
361-
fs = ObjectStore('s3://bucket', options=options)
362-
table = await read_flatgeobuf_async("path/in/bucket.fgb", fs=fs)
357+
store = ObjectStore('s3://bucket', options=options)
358+
table = await read_flatgeobuf_async("path/in/bucket.fgb", store=store)
363359
```
364360
365361
Args:
366362
path: the url or relative path to a remote FlatGeobuf file. If an argument is passed for
367-
`fs`, this should be a path fragment relative to the root passed to the `ObjectStore`
363+
`store`, this should be a path fragment relative to the root passed to the `ObjectStore`
368364
constructor.
369365
370366
Other args:
371-
fs: an ObjectStore instance for this url. This is required for non-HTTP urls.
367+
store: an ObjectStore instance for this url. This is required for non-HTTP urls.
372368
batch_size: the number of rows to include in each internal batch of the table.
373369
bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
374370
`None`, no spatial filtering will be performed.
@@ -409,7 +405,7 @@ def read_geojson_lines(
409405
def read_parquet(
410406
path: Union[str, Path, BinaryIO],
411407
*,
412-
fs: Optional[ObjectStore] = None,
408+
store: Optional[ObjectStore] = None,
413409
batch_size: int = 65536,
414410
) -> Table:
415411
"""
@@ -443,13 +439,13 @@ def read_parquet(
443439
"aws_secret_access_key": "...",
444440
"aws_region": "..."
445441
}
446-
fs = ObjectStore('s3://bucket', options=options)
447-
table = read_parquet("path/in/bucket.parquet", fs=fs)
442+
store = ObjectStore('s3://bucket', options=options)
443+
table = read_parquet("path/in/bucket.parquet", store=store)
448444
```
449445
450446
Args:
451447
path: the path to the file
452-
fs: the ObjectStore to read from. Defaults to None.
448+
store: the ObjectStore to read from. Defaults to None.
453449
batch_size: the number of rows to include in each internal batch of the table.
454450
455451
Returns:
@@ -459,7 +455,7 @@ def read_parquet(
459455
async def read_parquet_async(
460456
path: Union[str, Path, BinaryIO],
461457
*,
462-
fs: Optional[ObjectStore] = None,
458+
store: Optional[ObjectStore] = None,
463459
batch_size: int = 65536,
464460
) -> Table:
465461
"""
@@ -486,13 +482,13 @@ async def read_parquet_async(
486482
"aws_secret_access_key": "...",
487483
"aws_region": "..."
488484
}
489-
fs = ObjectStore('s3://bucket', options=options)
490-
table = await read_parquet_async("path/in/bucket.parquet", fs=fs)
485+
store = ObjectStore('s3://bucket', options=options)
486+
table = await read_parquet_async("path/in/bucket.parquet", store=store)
491487
```
492488
493489
Args:
494490
path: the path to the file
495-
fs: the ObjectStore to read from. Defaults to None.
491+
store: the ObjectStore to read from. Defaults to None.
496492
batch_size: the number of rows to include in each internal batch of the table.
497493
498494
Returns:

python/geoarrow-io/src/io/flatgeobuf/async.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@ use pyo3::prelude::*;
88
use pyo3_async_runtimes::tokio::future_into_py;
99

1010
#[pyfunction]
11-
#[pyo3(signature = (path, *, fs=None, batch_size=65536, bbox=None))]
11+
#[pyo3(signature = (path, *, store=None, batch_size=65536, bbox=None))]
1212
pub fn read_flatgeobuf_async(
1313
py: Python,
1414
path: PyObject,
15-
fs: Option<PyObject>,
15+
store: Option<PyObject>,
1616
batch_size: usize,
1717
bbox: Option<(f64, f64, f64, f64)>,
1818
) -> PyGeoArrowResult<PyObject> {
19-
let reader = construct_reader(py, path, fs)?;
19+
let reader = construct_reader(py, path, store)?;
2020
match reader {
2121
AnyFileReader::Async(async_reader) => {
2222
let fut = future_into_py(py, async move {

python/geoarrow-io/src/io/flatgeobuf/sync.rs

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,37 @@ use pyo3::prelude::*;
99
use pyo3_arrow::input::AnyRecordBatch;
1010

1111
#[pyfunction]
12-
#[pyo3(signature = (file, *, fs=None, batch_size=65536, bbox=None))]
12+
#[pyo3(signature = (file, *, store=None, batch_size=65536, bbox=None))]
1313
pub fn read_flatgeobuf(
1414
py: Python,
1515
file: PyObject,
16-
fs: Option<PyObject>,
16+
store: Option<PyObject>,
1717
batch_size: usize,
1818
bbox: Option<(f64, f64, f64, f64)>,
1919
) -> PyGeoArrowResult<PyObject> {
20-
let reader = construct_reader(py, file, fs)?;
20+
let reader = construct_reader(py, file, store)?;
2121
match reader {
2222
#[cfg(feature = "async")]
23-
AnyFileReader::Async(async_reader) => async_reader.runtime.block_on(async move {
24-
use geoarrow::io::flatgeobuf::read_flatgeobuf_async as _read_flatgeobuf_async;
23+
AnyFileReader::Async(async_reader) => {
24+
use crate::runtime::get_runtime;
2525

26-
let options = FlatGeobufReaderOptions {
27-
batch_size: Some(batch_size),
28-
bbox,
29-
..Default::default()
30-
};
31-
let table = _read_flatgeobuf_async(async_reader.store, async_reader.path, options)
32-
.await
33-
.map_err(PyGeoArrowError::GeoArrowError)?;
26+
let runtime = get_runtime(py)?;
3427

35-
Ok(table_to_pytable(table).to_arro3(py)?)
36-
}),
28+
runtime.block_on(async move {
29+
use geoarrow::io::flatgeobuf::read_flatgeobuf_async as _read_flatgeobuf_async;
30+
31+
let options = FlatGeobufReaderOptions {
32+
batch_size: Some(batch_size),
33+
bbox,
34+
..Default::default()
35+
};
36+
let table = _read_flatgeobuf_async(async_reader.store, async_reader.path, options)
37+
.await
38+
.map_err(PyGeoArrowError::GeoArrowError)?;
39+
40+
Ok(table_to_pytable(table).to_arro3(py)?)
41+
})
42+
}
3743
AnyFileReader::Sync(mut sync_reader) => {
3844
let options = FlatGeobufReaderOptions {
3945
batch_size: Some(batch_size),

python/geoarrow-io/src/io/input/mod.rs

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,22 @@ use std::sync::Arc;
44

55
use crate::error::PyGeoArrowResult;
66
#[cfg(feature = "async")]
7-
use crate::io::object_store::PyObjectStore;
8-
#[cfg(feature = "async")]
97
use object_store::http::HttpBuilder;
108
#[cfg(feature = "async")]
119
use object_store::path::Path;
1210
#[cfg(feature = "async")]
1311
use object_store::{ClientOptions, ObjectStore};
14-
use pyo3::exceptions::PyValueError;
12+
#[cfg(feature = "async")]
13+
use pyo3_object_store::PyObjectStore;
1514
use sync::FileReader;
1615

1716
use pyo3::prelude::*;
18-
#[cfg(feature = "async")]
19-
use tokio::runtime::Runtime;
2017
use url::Url;
2118

2219
#[cfg(feature = "async")]
2320
pub struct AsyncFileReader {
2421
pub store: Arc<dyn ObjectStore>,
2522
pub path: Path,
26-
pub runtime: Arc<Runtime>,
2723
}
2824

2925
pub enum AnyFileReader {
@@ -39,16 +35,15 @@ pub enum AnyFileReader {
3935
pub fn construct_reader(
4036
py: Python,
4137
file: PyObject,
42-
fs: Option<PyObject>,
38+
store: Option<PyObject>,
4339
) -> PyGeoArrowResult<AnyFileReader> {
4440
// If the user passed an object store instance, use that
4541
#[cfg(feature = "async")]
46-
if let Some(fs) = fs {
47-
let fs = fs.extract::<PyObjectStore>(py)?;
42+
if let Some(store) = store {
43+
let store = store.extract::<PyObjectStore>(py)?;
4844
let path = file.extract::<String>(py)?;
4945
let async_reader = AsyncFileReader {
50-
store: fs.inner,
51-
runtime: fs.rt,
46+
store: store.into_inner(),
5247
path: path.into(),
5348
};
5449
return Ok(AnyFileReader::Async(async_reader));
@@ -70,13 +65,8 @@ pub fn construct_reader(
7065
.build()?;
7166
let path = url.path().trim_start_matches('/');
7267

73-
let runtime = Arc::new(
74-
tokio::runtime::Runtime::new()
75-
.map_err(|err| PyValueError::new_err(err.to_string()))?,
76-
);
7768
let async_reader = AsyncFileReader {
7869
store: Arc::new(store),
79-
runtime,
8070
path: path.into(),
8171
};
8272
return Ok(AnyFileReader::Async(async_reader));

python/geoarrow-io/src/io/mod.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@ pub mod flatgeobuf;
55
pub mod geojson;
66
pub mod geojson_lines;
77
pub mod input;
8-
#[cfg(feature = "async")]
9-
pub mod object_store;
108
pub mod parquet;
119
#[cfg(feature = "async")]
1210
pub mod postgis;

0 commit comments

Comments
 (0)