Skip to content

Commit 6c8292f

Browse files
authored
feat: expose http object store (#885)
* feat: expose HTTP ObjectStore The objective is to allow the user to register CSV directly from an HTTP URL, delaying downloading the file until required * chore: return PyResult
1 parent ec8246d commit 6c8292f

File tree

8 files changed

+51
-23
lines changed

8 files changed

+51
-23
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ uuid = { version = "1.9", features = ["v4"] }
4747
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
4848
async-trait = "0.1"
4949
futures = "0.3"
50-
object_store = { version = "0.11.0", features = ["aws", "gcp", "azure"] }
50+
object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] }
5151
parking_lot = "0.12"
5252
regex-syntax = "0.8"
5353
syn = "2.0.79"

python/datafusion/context.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,9 @@ def __init__(
450450

451451
self.ctx = SessionContextInternal(config, runtime)
452452

453-
def register_object_store(self, schema: str, store: Any, host: str | None) -> None:
453+
def register_object_store(
454+
self, schema: str, store: Any, host: str | None = None
455+
) -> None:
454456
"""Add a new object store into the session.
455457
456458
Args:

python/datafusion/object_store.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,6 @@
2222
GoogleCloud = object_store.GoogleCloud
2323
LocalFileSystem = object_store.LocalFileSystem
2424
MicrosoftAzure = object_store.MicrosoftAzure
25+
Http = object_store.Http
2526

26-
__all__ = [
27-
"AmazonS3",
28-
"GoogleCloud",
29-
"LocalFileSystem",
30-
"MicrosoftAzure",
31-
]
32-
33-
34-
def __getattr__(name):
35-
return getattr(object_store, name)
27+
__all__ = ["AmazonS3", "GoogleCloud", "LocalFileSystem", "MicrosoftAzure", "Http"]

python/tests/test_sql.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pyarrow.csv import write_csv
2323
import pyarrow.dataset as ds
2424
import pytest
25-
from datafusion.object_store import LocalFileSystem
25+
from datafusion.object_store import Http
2626

2727
from datafusion import udf, col
2828

@@ -139,6 +139,15 @@ def test_register_csv_list(ctx, tmp_path):
139139
assert int_sum == 2 * sum(int_values)
140140

141141

142+
def test_register_http_csv(ctx):
143+
url = "https://raw.githubusercontent.com/ibis-project/testing-data/refs/heads/master/csv/diamonds.csv"
144+
ctx.register_object_store("", Http(url))
145+
ctx.register_csv("remote", url)
146+
assert ctx.table_exist("remote")
147+
res, *_ = ctx.sql("SELECT COUNT(*) AS total FROM remote").to_pylist()
148+
assert res["total"] > 0
149+
150+
142151
def test_register_parquet(ctx, tmp_path):
143152
path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data())
144153
ctx.register_parquet("t", path)
@@ -494,7 +503,6 @@ def test_register_listing_table(
494503

495504
dir_root = f"file://{dir_root}/" if path_to_str else dir_root
496505

497-
ctx.register_object_store("file://local", LocalFileSystem(), None)
498506
ctx.register_listing_table(
499507
"my_table",
500508
dir_root,

python/tests/test_store.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,15 @@
1616
# under the License.
1717

1818
import os
19+
1920
import pytest
2021

2122
from datafusion import SessionContext
22-
from datafusion.object_store import LocalFileSystem
23-
24-
25-
@pytest.fixture
26-
def local():
27-
return LocalFileSystem()
2823

2924

3025
@pytest.fixture
31-
def ctx(local):
26+
def ctx():
3227
ctx = SessionContext()
33-
ctx.register_object_store("file://local", local, None)
3428
return ctx
3529

3630

python/tests/test_wrapper_coverage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def missing_exports(internal_obj, wrapped_obj) -> None:
5555

5656

5757
def test_datafusion_missing_exports() -> None:
58-
"""Check for any missing pythone exports.
58+
"""Check for any missing python exports.
5959
6060
This test verifies that every exposed class, attribute, and function in
6161
the internal (pyo3) module is also exposed in our python wrappers.

src/context.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ impl PySessionContext {
312312
StorageContexts::GoogleCloudStorage(gcs) => (gcs.inner, gcs.bucket_name),
313313
StorageContexts::MicrosoftAzure(azure) => (azure.inner, azure.container_name),
314314
StorageContexts::LocalFileSystem(local) => (local.inner, "".to_string()),
315+
StorageContexts::HTTP(http) => (http.store, http.url),
315316
};
316317

317318
// let users override the host to match the api signature from upstream

src/store.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@ use pyo3::prelude::*;
2222
use object_store::aws::{AmazonS3, AmazonS3Builder};
2323
use object_store::azure::{MicrosoftAzure, MicrosoftAzureBuilder};
2424
use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder};
25+
use object_store::http::{HttpBuilder, HttpStore};
2526
use object_store::local::LocalFileSystem;
27+
use pyo3::exceptions::PyValueError;
28+
use url::Url;
2629

2730
#[derive(FromPyObject)]
2831
pub enum StorageContexts {
2932
AmazonS3(PyAmazonS3Context),
3033
GoogleCloudStorage(PyGoogleCloudContext),
3134
MicrosoftAzure(PyMicrosoftAzureContext),
3235
LocalFileSystem(PyLocalFileSystemContext),
36+
HTTP(PyHttpContext),
3337
}
3438

3539
#[pyclass(name = "LocalFileSystem", module = "datafusion.store", subclass)]
@@ -219,10 +223,37 @@ impl PyAmazonS3Context {
219223
}
220224
}
221225

226+
#[pyclass(name = "Http", module = "datafusion.store", subclass)]
227+
#[derive(Debug, Clone)]
228+
pub struct PyHttpContext {
229+
pub url: String,
230+
pub store: Arc<HttpStore>,
231+
}
232+
233+
#[pymethods]
234+
impl PyHttpContext {
235+
#[new]
236+
fn new(url: String) -> PyResult<Self> {
237+
let store = match Url::parse(url.as_str()) {
238+
Ok(url) => HttpBuilder::new()
239+
.with_url(url.origin().ascii_serialization())
240+
.build(),
241+
Err(_) => HttpBuilder::new().build(),
242+
}
243+
.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))?;
244+
245+
Ok(Self {
246+
url,
247+
store: Arc::new(store),
248+
})
249+
}
250+
}
251+
222252
pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
223253
m.add_class::<PyAmazonS3Context>()?;
224254
m.add_class::<PyMicrosoftAzureContext>()?;
225255
m.add_class::<PyGoogleCloudContext>()?;
226256
m.add_class::<PyLocalFileSystemContext>()?;
257+
m.add_class::<PyHttpContext>()?;
227258
Ok(())
228259
}

0 commit comments

Comments
 (0)