Skip to content

Commit 80d670f

Browse files
feat: Add protocol aliases for IOConfig (#6252)
## Changes Made Adds **protocol aliases** to `IOConfig`: user-defined mappings from custom scheme names to existing schemes. For example, `"my-s3" -> "s3"` lets organizations use domain-specific protocol names that route to standard backends (including native S3, Azure, GCS — not just OpenDAL). **Python API:** ```python io_config = IOConfig( protocol_aliases={"my-s3": "s3", "company-store": "gcs"}, s3=S3Config(endpoint_url="https://my-proprietary-endpoint.example.com"), ) daft.read_parquet("my-s3://bucket/path", io_config=io_config) ``` ### Implementation - **`src/common/io-config/src/config.rs`** — Added `protocol_aliases: BTreeMap<String, String>` field to `IOConfig`, display support, and `validate_protocol_aliases()` that rejects alias keys matching built-in schemes. - **`src/daft-io/src/lib.rs`** — Added `resolve_url_alias()` using `Cow` for zero-allocation on the common (no-alias) path. Integrated into `get_source_and_path()`, `single_url_get()`, `single_url_put()`, and `single_url_get_size()`. Added 7 Rust unit tests. - **`src/common/io-config/src/python.rs`** — Added `protocol_aliases` parameter to `IOConfig::new()` and `replace()` with case normalization and validation. Added getter. - **`daft/daft/__init__.pyi`** — Updated type stubs. - **`tests/io/test_protocol_aliases.py`** — 9 config tests + 2 integration tests using OpenDAL `fs` backend. ### Design Decisions - **Single-level resolution** — no chaining, avoids infinite loops - **Built-in scheme protection** — aliasing `s3`, `gcs`, etc. as keys is rejected at construction time - **Case-insensitive** — consistent with `parse_url()` which already lowercases schemes - **Minimal change surface** — `parse_url()` and its 17+ external callers remain untouched; alias resolution happens in `IOClient` methods before calling `parse_url()` ## Related Issues Builds on PR #6177 (OpenDAL support). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cd2953d commit 80d670f

File tree

5 files changed

+327
-72
lines changed

5 files changed

+327
-72
lines changed

daft/daft/__init__.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,7 @@ class IOConfig:
995995
gravitino: GravitinoConfig
996996
cos: CosConfig
997997
opendal_backends: dict[str, dict[str, str]]
998+
protocol_aliases: dict[str, str]
998999

9991000
def __init__(
10001001
self,
@@ -1009,6 +1010,7 @@ class IOConfig:
10091010
gravitino: GravitinoConfig | None = None,
10101011
cos: CosConfig | None = None,
10111012
opendal_backends: dict[str, dict[str, str]] | None = None,
1013+
protocol_aliases: dict[str, str] | None = None,
10121014
): ...
10131015
def replace(
10141016
self,
@@ -1023,6 +1025,7 @@ class IOConfig:
10231025
gravitino: GravitinoConfig | None = None,
10241026
cos: CosConfig | None = None,
10251027
opendal_backends: dict[str, dict[str, str]] | None = None,
1028+
protocol_aliases: dict[str, str] | None = None,
10261029
) -> IOConfig:
10271030
"""Replaces values if provided, returning a new IOConfig."""
10281031
...

src/common/io-config/src/config.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ pub struct IOConfig {
2525
/// Additional backends configured via OpenDAL.
2626
/// Keys are scheme names (e.g. "oss", "cos"), values are key-value config maps.
2727
pub opendal_backends: BTreeMap<String, BTreeMap<String, String>>,
28+
/// Protocol aliases: maps custom scheme names to existing scheme names.
29+
/// For example, {"my-s3": "s3"} rewrites "my-s3://bucket/path" to "s3://bucket/path".
30+
pub protocol_aliases: BTreeMap<String, String>,
2831
}
2932

3033
impl IOConfig {
@@ -74,8 +77,28 @@ impl IOConfig {
7477
if !self.opendal_backends.is_empty() {
7578
res.push(format!("OpenDAL backends = {:?}", self.opendal_backends));
7679
}
80+
if !self.protocol_aliases.is_empty() {
81+
res.push(format!("Protocol aliases = {:?}", self.protocol_aliases));
82+
}
7783
res
7884
}
85+
86+
/// Validates that no protocol alias key shadows a built-in scheme.
87+
pub fn validate_protocol_aliases(&self) -> std::result::Result<(), String> {
88+
const BUILTIN_SCHEMES: &[&str] = &[
89+
"file", "http", "https", "s3", "s3a", "s3n", "az", "abfs", "abfss", "gcs", "gs", "hf",
90+
"tos", "cos", "cosn", "vol+dbfs", "dbfs", "gvfs",
91+
];
92+
for key in self.protocol_aliases.keys() {
93+
if BUILTIN_SCHEMES.contains(&key.as_str()) {
94+
return Err(format!(
95+
"Protocol alias key '{key}' conflicts with built-in scheme. \
96+
Aliases can only map new custom scheme names to existing schemes."
97+
));
98+
}
99+
}
100+
Ok(())
101+
}
79102
}
80103

81104
impl Display for IOConfig {

src/common/io-config/src/python.rs

Lines changed: 91 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,6 @@ pub struct CosConfig {
214214
#[pymethods]
215215
impl IOConfig {
216216
#[new]
217-
#[must_use]
218217
#[allow(clippy::too_many_arguments)]
219218
#[pyo3(signature = (
220219
s3=None,
@@ -227,7 +226,8 @@ impl IOConfig {
227226
tos=None,
228227
gravitino=None,
229228
cos=None,
230-
opendal_backends=None
229+
opendal_backends=None,
230+
protocol_aliases=None
231231
))]
232232
#[allow(clippy::too_many_arguments)]
233233
pub fn new(
@@ -242,30 +242,36 @@ impl IOConfig {
242242
gravitino: Option<GravitinoConfig>,
243243
cos: Option<CosConfig>,
244244
opendal_backends: Option<HashMap<String, HashMap<String, String>>>,
245-
) -> Self {
246-
Self {
247-
config: config::IOConfig {
248-
s3: s3.unwrap_or_default().config,
249-
azure: azure.unwrap_or_default().config,
250-
gcs: gcs.unwrap_or_default().config,
251-
http: http.unwrap_or_default().config,
252-
unity: unity.unwrap_or_default().config,
253-
hf: hf.unwrap_or_default().config,
254-
disable_suffix_range: disable_suffix_range.unwrap_or_default(),
255-
tos: tos.unwrap_or_default().config,
256-
gravitino: gravitino.unwrap_or_default().config,
257-
cos: cos.unwrap_or_default().config,
258-
opendal_backends: opendal_backends
259-
.unwrap_or_default()
260-
.into_iter()
261-
.map(|(k, v)| (k, v.into_iter().collect()))
262-
.collect(),
263-
},
264-
}
245+
protocol_aliases: Option<HashMap<String, String>>,
246+
) -> PyResult<Self> {
247+
let cfg = config::IOConfig {
248+
s3: s3.unwrap_or_default().config,
249+
azure: azure.unwrap_or_default().config,
250+
gcs: gcs.unwrap_or_default().config,
251+
http: http.unwrap_or_default().config,
252+
unity: unity.unwrap_or_default().config,
253+
hf: hf.unwrap_or_default().config,
254+
disable_suffix_range: disable_suffix_range.unwrap_or_default(),
255+
tos: tos.unwrap_or_default().config,
256+
gravitino: gravitino.unwrap_or_default().config,
257+
cos: cos.unwrap_or_default().config,
258+
opendal_backends: opendal_backends
259+
.unwrap_or_default()
260+
.into_iter()
261+
.map(|(k, v)| (k, v.into_iter().collect()))
262+
.collect(),
263+
protocol_aliases: protocol_aliases
264+
.unwrap_or_default()
265+
.into_iter()
266+
.map(|(k, v)| (k.to_lowercase(), v.to_lowercase()))
267+
.collect(),
268+
};
269+
cfg.validate_protocol_aliases()
270+
.map_err(pyo3::exceptions::PyValueError::new_err)?;
271+
Ok(Self { config: cfg })
265272
}
266273

267274
#[allow(clippy::too_many_arguments)]
268-
#[must_use]
269275
#[pyo3(signature = (
270276
s3=None,
271277
azure=None,
@@ -277,7 +283,8 @@ impl IOConfig {
277283
tos=None,
278284
gravitino=None,
279285
cos=None,
280-
opendal_backends=None
286+
opendal_backends=None,
287+
protocol_aliases=None
281288
))]
282289
#[allow(clippy::too_many_arguments)]
283290
pub fn replace(
@@ -293,47 +300,55 @@ impl IOConfig {
293300
gravitino: Option<GravitinoConfig>,
294301
cos: Option<CosConfig>,
295302
opendal_backends: Option<HashMap<String, HashMap<String, String>>>,
296-
) -> Self {
297-
Self {
298-
config: config::IOConfig {
299-
s3: s3
300-
.map(|s3| s3.config)
301-
.unwrap_or_else(|| self.config.s3.clone()),
302-
azure: azure
303-
.map(|azure| azure.config)
304-
.unwrap_or_else(|| self.config.azure.clone()),
305-
gcs: gcs
306-
.map(|gcs| gcs.config)
307-
.unwrap_or_else(|| self.config.gcs.clone()),
308-
http: http
309-
.map(|http| http.config)
310-
.unwrap_or_else(|| self.config.http.clone()),
311-
unity: unity
312-
.map(|unity| unity.config)
313-
.unwrap_or_else(|| self.config.unity.clone()),
314-
hf: hf
315-
.map(|hf| hf.config)
316-
.unwrap_or_else(|| self.config.hf.clone()),
317-
disable_suffix_range: disable_suffix_range
318-
.unwrap_or(self.config.disable_suffix_range),
319-
tos: tos
320-
.map(|tos| tos.config)
321-
.unwrap_or_else(|| self.config.tos.clone()),
322-
gravitino: gravitino
323-
.map(|gravitino| gravitino.config)
324-
.unwrap_or_else(|| self.config.gravitino.clone()),
325-
cos: cos
326-
.map(|cos| cos.config)
327-
.unwrap_or_else(|| self.config.cos.clone()),
328-
opendal_backends: opendal_backends
329-
.map(|b| {
330-
b.into_iter()
331-
.map(|(k, v)| (k, v.into_iter().collect()))
332-
.collect()
333-
})
334-
.unwrap_or_else(|| self.config.opendal_backends.clone()),
335-
},
336-
}
303+
protocol_aliases: Option<HashMap<String, String>>,
304+
) -> PyResult<Self> {
305+
let cfg = config::IOConfig {
306+
s3: s3
307+
.map(|s3| s3.config)
308+
.unwrap_or_else(|| self.config.s3.clone()),
309+
azure: azure
310+
.map(|azure| azure.config)
311+
.unwrap_or_else(|| self.config.azure.clone()),
312+
gcs: gcs
313+
.map(|gcs| gcs.config)
314+
.unwrap_or_else(|| self.config.gcs.clone()),
315+
http: http
316+
.map(|http| http.config)
317+
.unwrap_or_else(|| self.config.http.clone()),
318+
unity: unity
319+
.map(|unity| unity.config)
320+
.unwrap_or_else(|| self.config.unity.clone()),
321+
hf: hf
322+
.map(|hf| hf.config)
323+
.unwrap_or_else(|| self.config.hf.clone()),
324+
disable_suffix_range: disable_suffix_range.unwrap_or(self.config.disable_suffix_range),
325+
tos: tos
326+
.map(|tos| tos.config)
327+
.unwrap_or_else(|| self.config.tos.clone()),
328+
gravitino: gravitino
329+
.map(|gravitino| gravitino.config)
330+
.unwrap_or_else(|| self.config.gravitino.clone()),
331+
cos: cos
332+
.map(|cos| cos.config)
333+
.unwrap_or_else(|| self.config.cos.clone()),
334+
opendal_backends: opendal_backends
335+
.map(|b| {
336+
b.into_iter()
337+
.map(|(k, v)| (k, v.into_iter().collect()))
338+
.collect()
339+
})
340+
.unwrap_or_else(|| self.config.opendal_backends.clone()),
341+
protocol_aliases: protocol_aliases
342+
.map(|a| {
343+
a.into_iter()
344+
.map(|(k, v)| (k.to_lowercase(), v.to_lowercase()))
345+
.collect()
346+
})
347+
.unwrap_or_else(|| self.config.protocol_aliases.clone()),
348+
};
349+
cfg.validate_protocol_aliases()
350+
.map_err(pyo3::exceptions::PyValueError::new_err)?;
351+
Ok(Self { config: cfg })
337352
}
338353

339354
pub fn __repr__(&self) -> PyResult<String> {
@@ -416,6 +431,17 @@ impl IOConfig {
416431
.collect())
417432
}
418433

434+
/// Protocol aliases mapping custom scheme names to existing schemes
435+
#[getter]
436+
pub fn protocol_aliases(&self) -> PyResult<HashMap<String, String>> {
437+
Ok(self
438+
.config
439+
.protocol_aliases
440+
.iter()
441+
.map(|(k, v)| (k.clone(), v.clone()))
442+
.collect())
443+
}
444+
419445
/// Configuration to be used when accessing COS URLs
420446
#[getter]
421447
pub fn cos(&self) -> PyResult<CosConfig> {

0 commit comments

Comments
 (0)