diff --git a/docs/api/store/index.md b/docs/api/store/index.md index ba12e100..d0de1dcd 100644 --- a/docs/api/store/index.md +++ b/docs/api/store/index.md @@ -1,3 +1,4 @@ # ObjectStore +::: obstore.store.from_url ::: obstore.store.ObjectStore diff --git a/obstore/python/obstore/store/__init__.pyi b/obstore/python/obstore/store/__init__.pyi index 07fb5739..85ed00b4 100644 --- a/obstore/python/obstore/store/__init__.pyi +++ b/obstore/python/obstore/store/__init__.pyi @@ -1,5 +1,6 @@ # TODO: move to reusable types package from pathlib import Path +from typing import Any, Unpack, overload from ._aws import S3Config as S3Config from ._aws import S3Store as S3Store @@ -12,6 +13,83 @@ from ._http import HTTPStore as HTTPStore from ._retry import BackoffConfig as BackoffConfig from ._retry import RetryConfig as RetryConfig +@overload +def from_url( + url: str, + *, + config: S3Config | None = None, + client_options: ClientConfig | None = None, + retry_config: RetryConfig | None = None, + **kwargs: Unpack[S3Config], +) -> ObjectStore: ... +@overload +def from_url( + url: str, + *, + config: GCSConfig | None = None, + client_options: ClientConfig | None = None, + retry_config: RetryConfig | None = None, + **kwargs: Unpack[GCSConfig], +) -> ObjectStore: ... +@overload +def from_url( + url: str, + *, + config: AzureConfig | None = None, + client_options: ClientConfig | None = None, + retry_config: RetryConfig | None = None, + **kwargs: Unpack[AzureConfig], +) -> ObjectStore: ... +@overload +def from_url( + url: str, + *, + config: None = None, + client_options: None = None, + retry_config: None = None, + automatic_cleanup: bool = False, + mkdir: bool = False, +) -> ObjectStore: ... +def from_url( + url: str, + *, + config: S3Config | GCSConfig | AzureConfig | None = None, + client_options: ClientConfig | None = None, + retry_config: RetryConfig | None = None, + **kwargs: Any, +) -> ObjectStore: + """Easy construction of store by URL, identifying the relevant store. + + This will defer to a store-specific `from_url` constructor based on the provided + `url`. E.g. passing `"s3://bucket/path"` will defer to + [`S3Store.from_url`][obstore.store.S3Store.from_url]. + + Supported formats: + + - `file:///path/to/my/file` -> [`LocalStore`][obstore.store.LocalStore] + - `memory:///` -> [`MemoryStore`][obstore.store.MemoryStore] + - `s3://bucket/path` -> [`S3Store`][obstore.store.S3Store] (also supports `s3a`) + - `gs://bucket/path` -> [`GCSStore`][obstore.store.GCSStore] + - `az://account/container/path` -> [`AzureStore`][obstore.store.AzureStore] (also supports `adl`, `azure`, `abfs`, `abfss`) + - `http://mydomain/path` -> [`HTTPStore`][obstore.store.HTTPStore] + - `https://mydomain/path` -> [`HTTPStore`][obstore.store.HTTPStore] + + There are also special cases for AWS and Azure for `https://{host?}/path` paths: + + - `dfs.core.windows.net`, `blob.core.windows.net`, `dfs.fabric.microsoft.com`, `blob.fabric.microsoft.com` -> [`AzureStore`][obstore.store.AzureStore] + - `amazonaws.com` -> [`S3Store`][obstore.store.S3Store] + - `r2.cloudflarestorage.com` -> [`S3Store`][obstore.store.S3Store] + + Args: + url: well-known storage URL. + + Keyword Args: + config: per-store Configuration. Values in this config will override values inferred from the url. Defaults to None. + client_options: HTTP Client options. Defaults to None. + retry_config: Retry configuration. Defaults to None. + + """ + class LocalStore: """ Local filesystem storage providing an ObjectStore interface to files on local disk. diff --git a/obstore/src/signer.rs b/obstore/src/signer.rs index 406cdd17..3e3ed48f 100644 --- a/obstore/src/signer.rs +++ b/obstore/src/signer.rs @@ -13,10 +13,9 @@ use pyo3::exceptions::PyValueError; use pyo3::intern; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; -use pyo3::types::PyString; use pyo3_object_store::{ MaybePrefixedStore, PyAzureStore, PyGCSStore, PyObjectStoreError, PyObjectStoreResult, - PyS3Store, + PyS3Store, PyUrl, }; use url::Url; @@ -139,18 +138,6 @@ impl<'py> FromPyObject<'py> for PyMethod { } } -pub(crate) struct PyUrl(url::Url); - -impl<'py> IntoPyObject<'py> for PyUrl { - type Target = PyString; - type Output = Bound<'py, PyString>; - type Error = std::convert::Infallible; - - fn into_pyobject(self, py: Python<'py>) -> Result { - String::from(self.0).into_pyobject(py) - } -} - #[derive(IntoPyObject)] pub(crate) struct PyUrls(Vec); @@ -174,12 +161,12 @@ pub(crate) fn sign( py.allow_threads(|| match paths { PyPaths::One(path) => { let url = runtime.block_on(store.signed_url(method, &path, expires_in))?; - Ok(PySignResult::One(PyUrl(url))) + Ok(PySignResult::One(PyUrl::new(url))) } PyPaths::Many(paths) => { let urls = runtime.block_on(store.signed_urls(method, &paths, expires_in))?; Ok(PySignResult::Many(PyUrls( - urls.into_iter().map(PyUrl).collect(), + urls.into_iter().map(PyUrl::new).collect(), ))) } }) @@ -201,7 +188,7 @@ pub(crate) fn sign_async( .signed_url(method, &path, expires_in) .await .map_err(PyObjectStoreError::ObjectStoreError)?; - Ok(PySignResult::One(PyUrl(url))) + Ok(PySignResult::One(PyUrl::new(url))) } PyPaths::Many(paths) => { let urls = store @@ -209,7 +196,7 @@ pub(crate) fn sign_async( .await .map_err(PyObjectStoreError::ObjectStoreError)?; Ok(PySignResult::Many(PyUrls( - urls.into_iter().map(PyUrl).collect(), + urls.into_iter().map(PyUrl::new).collect(), ))) } } diff --git a/pyo3-object_store/src/api.rs b/pyo3-object_store/src/api.rs index b57ecbe7..a482810d 100644 --- a/pyo3-object_store/src/api.rs +++ b/pyo3-object_store/src/api.rs @@ -2,7 +2,9 @@ use pyo3::intern; use pyo3::prelude::*; use crate::error::*; -use crate::{PyAzureStore, PyGCSStore, PyHttpStore, PyLocalStore, PyMemoryStore, PyS3Store}; +use crate::{ + from_url, PyAzureStore, PyGCSStore, PyHttpStore, PyLocalStore, PyMemoryStore, PyS3Store, +}; /// Export the default Python API as a submodule named `store` within the given parent module /// @@ -44,6 +46,7 @@ pub fn register_store_module( let child_module = PyModule::new(parent_module.py(), "store")?; + child_module.add_wrapped(wrap_pyfunction!(from_url))?; child_module.add_class::()?; child_module.add_class::()?; child_module.add_class::()?; diff --git a/pyo3-object_store/src/aws.rs b/pyo3-object_store/src/aws.rs index 9dbf3c9d..6a6ae1ea 100644 --- a/pyo3-object_store/src/aws.rs +++ b/pyo3-object_store/src/aws.rs @@ -211,7 +211,7 @@ impl PyS3Store { #[classmethod] #[pyo3(signature = (url, *, config=None, client_options=None, retry_config=None, **kwargs))] - fn from_url( + pub(crate) fn from_url( _cls: &Bound, url: PyUrl, config: Option, @@ -274,6 +274,12 @@ impl<'py> FromPyObject<'py> for PyAmazonS3ConfigKey { } } +impl AsRef for PyAmazonS3ConfigKey { + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + impl<'py> IntoPyObject<'py> for PyAmazonS3ConfigKey { type Target = PyString; type Output = Bound<'py, PyString>; @@ -284,9 +290,17 @@ impl<'py> IntoPyObject<'py> for PyAmazonS3ConfigKey { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, FromPyObject, IntoPyObject)] +#[derive(Clone, Debug, Default, PartialEq, Eq, IntoPyObject)] pub struct PyAmazonS3Config(HashMap); +// Note: we manually impl FromPyObject instead of deriving it so that we can raise an +// UnknownConfigurationKeyError instead of a `TypeError` on invalid config keys. +impl<'py> FromPyObject<'py> for PyAmazonS3Config { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + Ok(Self(ob.extract()?)) + } +} + impl PyAmazonS3Config { fn apply_config(self, mut builder: AmazonS3Builder) -> AmazonS3Builder { for (key, value) in self.0.into_iter() { diff --git a/pyo3-object_store/src/azure.rs b/pyo3-object_store/src/azure.rs index 688018ac..430f72d4 100644 --- a/pyo3-object_store/src/azure.rs +++ b/pyo3-object_store/src/azure.rs @@ -122,7 +122,7 @@ impl PyAzureStore { #[classmethod] #[pyo3(signature = (url, *, config=None, client_options=None, retry_config=None, **kwargs))] - fn from_url( + pub(crate) fn from_url( _cls: &Bound, url: PyUrl, config: Option, @@ -199,6 +199,12 @@ impl<'py> FromPyObject<'py> for PyAzureConfigKey { } } +impl AsRef for PyAzureConfigKey { + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + impl<'py> IntoPyObject<'py> for PyAzureConfigKey { type Target = PyString; type Output = Bound<'py, PyString>; @@ -209,9 +215,17 @@ impl<'py> IntoPyObject<'py> for PyAzureConfigKey { } } -#[derive(Clone, Debug, PartialEq, Eq, FromPyObject, IntoPyObject)] +#[derive(Clone, Debug, PartialEq, Eq, IntoPyObject)] pub struct PyAzureConfig(HashMap); +// Note: we manually impl FromPyObject instead of deriving it so that we can raise an +// UnknownConfigurationKeyError instead of a `TypeError` on invalid config keys. +impl<'py> FromPyObject<'py> for PyAzureConfig { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + Ok(Self(ob.extract()?)) + } +} + impl PyAzureConfig { fn apply_config(self, mut builder: MicrosoftAzureBuilder) -> MicrosoftAzureBuilder { for (key, value) in self.0.into_iter() { diff --git a/pyo3-object_store/src/config.rs b/pyo3-object_store/src/config.rs index 0b556962..641cd512 100644 --- a/pyo3-object_store/src/config.rs +++ b/pyo3-object_store/src/config.rs @@ -24,3 +24,9 @@ impl<'py> FromPyObject<'py> for PyConfigValue { } } } + +impl From for String { + fn from(value: PyConfigValue) -> Self { + value.0 + } +} diff --git a/pyo3-object_store/src/gcp.rs b/pyo3-object_store/src/gcp.rs index 37fdb360..03be2031 100644 --- a/pyo3-object_store/src/gcp.rs +++ b/pyo3-object_store/src/gcp.rs @@ -122,7 +122,7 @@ impl PyGCSStore { #[classmethod] #[pyo3(signature = (url, *, config=None, client_options=None, retry_config=None, **kwargs))] - fn from_url( + pub(crate) fn from_url( _cls: &Bound, url: PyUrl, config: Option, @@ -197,6 +197,12 @@ impl<'py> FromPyObject<'py> for PyGoogleConfigKey { } } +impl AsRef for PyGoogleConfigKey { + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + impl<'py> IntoPyObject<'py> for PyGoogleConfigKey { type Target = PyString; type Output = Bound<'py, PyString>; @@ -207,9 +213,17 @@ impl<'py> IntoPyObject<'py> for PyGoogleConfigKey { } } -#[derive(Clone, Debug, PartialEq, Eq, FromPyObject, IntoPyObject)] +#[derive(Clone, Debug, PartialEq, Eq, IntoPyObject)] pub struct PyGoogleConfig(HashMap); +// Note: we manually impl FromPyObject instead of deriving it so that we can raise an +// UnknownConfigurationKeyError instead of a `TypeError` on invalid config keys. +impl<'py> FromPyObject<'py> for PyGoogleConfig { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + Ok(Self(ob.extract()?)) + } +} + impl PyGoogleConfig { fn apply_config(self, mut builder: GoogleCloudStorageBuilder) -> GoogleCloudStorageBuilder { for (key, value) in self.0.into_iter() { diff --git a/pyo3-object_store/src/http.rs b/pyo3-object_store/src/http.rs index 1985ba45..162d0054 100644 --- a/pyo3-object_store/src/http.rs +++ b/pyo3-object_store/src/http.rs @@ -7,10 +7,10 @@ use pyo3::{intern, IntoPyObjectExt}; use crate::error::PyObjectStoreResult; use crate::retry::PyRetryConfig; -use crate::PyClientOptions; +use crate::{PyClientOptions, PyUrl}; struct HTTPConfig { - url: String, + url: PyUrl, client_options: Option, retry_config: Option, } @@ -59,7 +59,7 @@ impl PyHttpStore { #[new] #[pyo3(signature = (url, *, client_options=None, retry_config=None))] fn new( - url: String, + url: PyUrl, client_options: Option, retry_config: Option, ) -> PyObjectStoreResult { @@ -82,9 +82,9 @@ impl PyHttpStore { #[classmethod] #[pyo3(signature = (url, *, client_options=None, retry_config=None))] - fn from_url( + pub(crate) fn from_url( _cls: &Bound, - url: String, + url: PyUrl, client_options: Option, retry_config: Option, ) -> PyObjectStoreResult { @@ -96,6 +96,6 @@ impl PyHttpStore { } fn __repr__(&self) -> String { - format!("HTTPStore(\"{}\")", &self.config.url) + format!("HTTPStore(\"{}\")", &self.config.url.as_ref()) } } diff --git a/pyo3-object_store/src/lib.rs b/pyo3-object_store/src/lib.rs index 7a15402b..4c35f320 100644 --- a/pyo3-object_store/src/lib.rs +++ b/pyo3-object_store/src/lib.rs @@ -14,6 +14,7 @@ mod memory; mod path; mod prefix; mod retry; +mod simple; mod store; mod url; @@ -27,5 +28,6 @@ pub use http::PyHttpStore; pub use local::PyLocalStore; pub use memory::PyMemoryStore; pub use prefix::MaybePrefixedStore; +pub use simple::from_url; pub use store::PyObjectStore; pub use url::PyUrl; diff --git a/pyo3-object_store/src/local.rs b/pyo3-object_store/src/local.rs index 308e0fa2..c4823264 100644 --- a/pyo3-object_store/src/local.rs +++ b/pyo3-object_store/src/local.rs @@ -7,9 +7,9 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyDict, PyTuple, PyType}; use pyo3::{intern, IntoPyObjectExt}; -use url::Url; use crate::error::PyObjectStoreResult; +use crate::PyUrl; #[derive(Clone, Debug)] struct LocalConfig { @@ -79,13 +79,13 @@ impl PyLocalStore { #[classmethod] #[pyo3(signature = (url, *, automatic_cleanup=false, mkdir=false))] - fn from_url( + pub(crate) fn from_url( _cls: &Bound, - url: &str, + url: PyUrl, automatic_cleanup: bool, mkdir: bool, ) -> PyObjectStoreResult { - let url = Url::parse(url).map_err(|err| PyValueError::new_err(err.to_string()))?; + let url = url.into_inner(); let (scheme, path) = ObjectStoreScheme::parse(&url).map_err(object_store::Error::from)?; if !matches!(scheme, ObjectStoreScheme::Local) { diff --git a/pyo3-object_store/src/memory.rs b/pyo3-object_store/src/memory.rs index 6e689511..996220e7 100644 --- a/pyo3-object_store/src/memory.rs +++ b/pyo3-object_store/src/memory.rs @@ -15,6 +15,12 @@ impl AsRef> for PyMemoryStore { } } +impl From> for PyMemoryStore { + fn from(value: Arc) -> Self { + Self(value) + } +} + impl<'py> PyMemoryStore { /// Consume self and return the underlying [`InMemory`]. pub fn into_inner(self) -> Arc { diff --git a/pyo3-object_store/src/simple.rs b/pyo3-object_store/src/simple.rs new file mode 100644 index 00000000..7984cf13 --- /dev/null +++ b/pyo3-object_store/src/simple.rs @@ -0,0 +1,121 @@ +use std::sync::Arc; + +use object_store::memory::InMemory; +use object_store::ObjectStoreScheme; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyType}; +use pyo3::{intern, IntoPyObjectExt}; + +use crate::error::ObstoreError; +use crate::retry::PyRetryConfig; +use crate::url::PyUrl; +use crate::{ + PyAzureStore, PyClientOptions, PyGCSStore, PyHttpStore, PyLocalStore, PyMemoryStore, + PyObjectStoreResult, PyS3Store, +}; + +/// Simple construction of stores by url. +// Note: We don't extract the PyObject in the function signature because it's possible that +// AWS/Azure/Google config keys could overlap. And so we don't want to accidentally parse a config +// as an AWS config before knowing that the URL scheme is AWS. +#[pyfunction] +#[pyo3(signature = (url, *, config=None, client_options=None, retry_config=None, **kwargs))] +pub fn from_url( + py: Python, + url: PyUrl, + config: Option>, + client_options: Option, + retry_config: Option, + kwargs: Option>, +) -> PyObjectStoreResult { + let (scheme, _) = ObjectStoreScheme::parse(&url.as_ref()).map_err(object_store::Error::from)?; + match scheme { + ObjectStoreScheme::AmazonS3 => { + let store = PyS3Store::from_url( + &PyType::new::(py), + url, + config.map(|x| x.extract()).transpose()?, + client_options, + retry_config, + kwargs.map(|x| x.extract()).transpose()?, + )?; + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + ObjectStoreScheme::GoogleCloudStorage => { + let store = PyGCSStore::from_url( + &PyType::new::(py), + url, + config.map(|x| x.extract()).transpose()?, + client_options, + retry_config, + kwargs.map(|x| x.extract()).transpose()?, + )?; + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + ObjectStoreScheme::MicrosoftAzure => { + let store = PyAzureStore::from_url( + &PyType::new::(py), + url, + config.map(|x| x.extract()).transpose()?, + client_options, + retry_config, + kwargs.map(|x| x.extract()).transpose()?, + )?; + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + ObjectStoreScheme::Http => { + raise_if_config_passed(config, kwargs, "http")?; + let store = PyHttpStore::from_url( + &PyType::new::(py), + url, + client_options, + retry_config, + )?; + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + ObjectStoreScheme::Local => { + let mut automatic_cleanup = false; + let mut mkdir = false; + if let Some(kwargs) = kwargs { + let kwargs = kwargs.extract::>()?; + if let Some(val) = kwargs.get_item(intern!(py, "automatic_cleanup"))? { + automatic_cleanup = val.extract()?; + } + if let Some(val) = kwargs.get_item(intern!(py, "mkdir"))? { + mkdir = val.extract()?; + } + } + + let store = PyLocalStore::from_url( + &PyType::new::(py), + url, + automatic_cleanup, + mkdir, + )?; + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + ObjectStoreScheme::Memory => { + raise_if_config_passed(config, kwargs, "memory")?; + let store: PyMemoryStore = Arc::new(InMemory::new()).into(); + Ok(store.into_pyobject(py)?.into_py_any(py)?) + } + scheme => { + return Err(ObstoreError::new_err(format!("Unknown URL scheme {:?}", scheme,)).into()); + } + } +} + +fn raise_if_config_passed( + config: Option>, + kwargs: Option>, + scheme: &str, +) -> PyObjectStoreResult<()> { + if config.is_some() || kwargs.is_some() { + return Err(ObstoreError::new_err(format!( + "Cannot pass config or keyword parameters for scheme {:?}", + scheme, + )) + .into()); + } + Ok(()) +} diff --git a/pyo3-object_store/src/url.rs b/pyo3-object_store/src/url.rs index 71c1d61a..ed7ac5b3 100644 --- a/pyo3-object_store/src/url.rs +++ b/pyo3-object_store/src/url.rs @@ -1,6 +1,7 @@ use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; -use pyo3::types::PyAnyMethods; +use pyo3::types::{PyAnyMethods, PyString}; use pyo3::FromPyObject; use url::Url; @@ -9,7 +10,12 @@ use url::Url; pub struct PyUrl(Url); impl PyUrl { - /// Return the underlying [Url] + /// Create a new PyUrl from a [Url] + pub fn new(url: Url) -> Self { + Self(url) + } + + /// Consume self and return the underlying [Url] pub fn into_inner(self) -> Url { self.0 } @@ -23,6 +29,16 @@ impl<'py> FromPyObject<'py> for PyUrl { } } +impl<'py> IntoPyObject<'py> for PyUrl { + type Target = PyString; + type Output = Bound<'py, PyString>; + type Error = std::convert::Infallible; + + fn into_pyobject(self, py: Python<'py>) -> Result { + String::from(self.0).into_pyobject(py) + } +} + impl AsRef for PyUrl { fn as_ref(&self) -> &Url { &self.0 diff --git a/tests/store/test_from_url.py b/tests/store/test_from_url.py new file mode 100644 index 00000000..8d3b8f82 --- /dev/null +++ b/tests/store/test_from_url.py @@ -0,0 +1,55 @@ +from pathlib import Path + +import pytest + +from obstore.exceptions import ObstoreError, UnknownConfigurationKeyError +from obstore.store import from_url + + +def test_local(): + cwd = Path(".").absolute() + url = f"file://{cwd}" + _store = from_url(url) + + +def test_memory(): + url = "memory:///" + _store = from_url(url) + + with pytest.raises(ObstoreError): + from_url(url, aws_access_key_id="test") + + +def test_s3_params(): + from_url( + "s3://bucket/path", + access_key_id="access_key_id", + secret_access_key="secret_access_key", + ) + + with pytest.raises(UnknownConfigurationKeyError): + from_url("s3://bucket/path", azure_authority_id="") + + +def test_gcs_params(): + # Just to test the params. In practice, the bucket shouldn't be passed + from_url("gs://test.example.com/path", google_bucket="test_bucket") + + with pytest.raises(UnknownConfigurationKeyError): + from_url("gs://test.example.com/path", azure_authority_id="") + + +def test_azure_params(): + url = "abfs://container@account.dfs.core.windows.net/path" + from_url(url, azure_skip_signature=True) + + with pytest.raises(UnknownConfigurationKeyError): + from_url(url, aws_bucket="test") + + +def test_http(): + url = "https://mydomain/path" + from_url(url) + + with pytest.raises(ObstoreError): + from_url(url, aws_bucket="test") diff --git a/tests/store/test_s3.py b/tests/store/test_s3.py index fd1d3174..00fc4688 100644 --- a/tests/store/test_s3.py +++ b/tests/store/test_s3.py @@ -5,7 +5,7 @@ import obstore as obs from obstore.exceptions import ObstoreError -from obstore.store import S3Store +from obstore.store import S3Store, from_url @pytest.mark.asyncio @@ -38,6 +38,16 @@ def test_error_overlapping_config_kwargs(): S3Store("bucket", config={"AWS_SKIP_SIGNATURE": True}, skip_signature=True) +@pytest.mark.asyncio +async def test_from_url(): + store = from_url( + "s3://ookla-open-data/parquet/performance/type=fixed/year=2024/quarter=1", + region="us-west-2", + skip_signature=True, + ) + _meta = await obs.head_async(store, "2024-01-01_performance_fixed_tiles.parquet") + + def test_pickle(): store = S3Store( "ookla-open-data",