Skip to content

Commit b50a4b7

Browse files
committed
add option to preserve empty URL paths
1 parent cbe2dd2 commit b50a4b7

File tree

6 files changed

+336
-102
lines changed

6 files changed

+336
-102
lines changed

python/pydantic_core/core_schema.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3904,6 +3904,7 @@ def url_schema(
39043904
default_host: str | None = None,
39053905
default_port: int | None = None,
39063906
default_path: str | None = None,
3907+
preserve_empty_path: bool | None = None,
39073908
strict: bool | None = None,
39083909
ref: str | None = None,
39093910
metadata: dict[str, Any] | None = None,
@@ -3928,6 +3929,7 @@ def url_schema(
39283929
default_host: The default host to use if the URL does not have a host
39293930
default_port: The default port to use if the URL does not have a port
39303931
default_path: The default path to use if the URL does not have a path
3932+
preserve_empty_path: Whether to preserve an empty path or convert it to '/', default False
39313933
strict: Whether to use strict URL parsing
39323934
ref: optional unique identifier of the schema, used to reference the schema in other places
39333935
metadata: Any other information you want to include with the schema, not used by pydantic-core
@@ -3941,6 +3943,7 @@ def url_schema(
39413943
default_host=default_host,
39423944
default_port=default_port,
39433945
default_path=default_path,
3946+
preserve_empty_path=preserve_empty_path,
39443947
strict=strict,
39453948
ref=ref,
39463949
metadata=metadata,
@@ -3970,6 +3973,7 @@ def multi_host_url_schema(
39703973
default_host: str | None = None,
39713974
default_port: int | None = None,
39723975
default_path: str | None = None,
3976+
preserve_empty_path: bool | None = None,
39733977
strict: bool | None = None,
39743978
ref: str | None = None,
39753979
metadata: dict[str, Any] | None = None,
@@ -3994,6 +3998,7 @@ def multi_host_url_schema(
39943998
default_host: The default host to use if the URL does not have a host
39953999
default_port: The default port to use if the URL does not have a port
39964000
default_path: The default path to use if the URL does not have a path
4001+
preserve_empty_path: Whether to preserve an empty path or convert it to '/', default False
39974002
strict: Whether to use strict URL parsing
39984003
ref: optional unique identifier of the schema, used to reference the schema in other places
39994004
metadata: Any other information you want to include with the schema, not used by pydantic-core
@@ -4007,6 +4012,7 @@ def multi_host_url_schema(
40074012
default_host=default_host,
40084013
default_port=default_port,
40094014
default_path=default_path,
4015+
preserve_empty_path=preserve_empty_path,
40104016
strict=strict,
40114017
ref=ref,
40124018
metadata=metadata,

src/serializers/infer.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -197,11 +197,11 @@ pub(crate) fn infer_to_python_known(
197197
}
198198
ObType::Url => {
199199
let py_url: PyUrl = value.extract()?;
200-
py_url.__str__().into_py_any(py)?
200+
py_url.__str__(py).into_py_any(py)?
201201
}
202202
ObType::MultiHostUrl => {
203203
let py_url: PyMultiHostUrl = value.extract()?;
204-
py_url.__str__().into_py_any(py)?
204+
py_url.__str__(py).into_py_any(py)?
205205
}
206206
ObType::Uuid => {
207207
let uuid = super::type_serializers::uuid::uuid_to_string(value)?;
@@ -476,11 +476,11 @@ pub(crate) fn infer_serialize_known<S: Serializer>(
476476
}
477477
ObType::Url => {
478478
let py_url: PyUrl = value.extract().map_err(py_err_se_err)?;
479-
serializer.serialize_str(py_url.__str__())
479+
serializer.serialize_str(py_url.__str__(value.py()))
480480
}
481481
ObType::MultiHostUrl => {
482482
let py_url: PyMultiHostUrl = value.extract().map_err(py_err_se_err)?;
483-
serializer.serialize_str(&py_url.__str__())
483+
serializer.serialize_str(&py_url.__str__(value.py()))
484484
}
485485
ObType::PydanticSerializable => {
486486
let py = value.py();
@@ -644,11 +644,11 @@ pub(crate) fn infer_json_key_known<'a>(
644644
}
645645
ObType::Url => {
646646
let py_url: PyUrl = key.extract()?;
647-
Ok(Cow::Owned(py_url.__str__().to_string()))
647+
Ok(Cow::Owned(py_url.__str__(key.py()).to_string()))
648648
}
649649
ObType::MultiHostUrl => {
650650
let py_url: PyMultiHostUrl = key.extract()?;
651-
Ok(Cow::Owned(py_url.__str__()))
651+
Ok(Cow::Owned(py_url.__str__(key.py()).to_string()))
652652
}
653653
ObType::Tuple => {
654654
let mut key_build = super::type_serializers::tuple::KeyBuilder::new();

src/serializers/type_serializers/url.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ macro_rules! build_serializer {
4343
let py = value.py();
4444
match value.extract::<$extract>() {
4545
Ok(py_url) => match extra.mode {
46-
SerMode::Json => py_url.__str__().into_py_any(py),
46+
SerMode::Json => py_url.__str__(value.py()).into_py_any(py),
4747
_ => Ok(value.clone().unbind()),
4848
},
4949
Err(_) => {
@@ -55,7 +55,7 @@ macro_rules! build_serializer {
5555

5656
fn json_key<'a>(&self, key: &'a Bound<'_, PyAny>, extra: &Extra) -> PyResult<Cow<'a, str>> {
5757
match key.extract::<$extract>() {
58-
Ok(py_url) => Ok(Cow::Owned(py_url.__str__().to_string())),
58+
Ok(py_url) => Ok(Cow::Owned(py_url.__str__(key.py()).to_string())),
5959
Err(_) => {
6060
extra.warnings.on_fallback_py(self.get_name(), key, extra)?;
6161
infer_json_key(key, extra)
@@ -72,7 +72,7 @@ macro_rules! build_serializer {
7272
extra: &Extra,
7373
) -> Result<S::Ok, S::Error> {
7474
match value.extract::<$extract>() {
75-
Ok(py_url) => serializer.serialize_str(&py_url.__str__()),
75+
Ok(py_url) => serializer.serialize_str(&py_url.__str__(value.py())),
7676
Err(_) => {
7777
extra
7878
.warnings

src/url.rs

Lines changed: 130 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ use std::collections::hash_map::DefaultHasher;
22
use std::fmt;
33
use std::fmt::Formatter;
44
use std::hash::{Hash, Hasher};
5+
use std::sync::OnceLock;
56

67
use idna::punycode::decode_to_string;
78
use pyo3::exceptions::PyValueError;
89
use pyo3::pyclass::CompareOp;
9-
use pyo3::sync::GILOnceCell;
10+
use pyo3::sync::{GILOnceCell, OnceLockExt};
1011
use pyo3::types::{PyDict, PyType};
1112
use pyo3::{intern, prelude::*, IntoPyObjectExt};
1213
use url::Url;
@@ -17,35 +18,70 @@ use crate::SchemaValidator;
1718
static SCHEMA_DEFINITION_URL: GILOnceCell<SchemaValidator> = GILOnceCell::new();
1819

1920
#[pyclass(name = "Url", module = "pydantic_core._pydantic_core", subclass, frozen)]
20-
#[derive(Clone, Hash)]
21+
#[derive(Clone)]
2122
#[cfg_attr(debug_assertions, derive(Debug))]
2223
pub struct PyUrl {
2324
lib_url: Url,
25+
/// Whether to serialize the path as empty when it is `/`. The `url` crate always normalizes an empty path to `/`,
26+
/// but users may want to preserve the empty path when round-tripping.
27+
serialize_path_as_empty: bool,
28+
/// Cache for the serialized representation where this diverges from `lib_url.as_str()`
29+
/// (i.e. when trailing slash was added to the empty path, but user didn't want that)
30+
serialized: OnceLock<String>,
31+
}
32+
33+
impl Hash for PyUrl {
34+
fn hash<H: Hasher>(&self, state: &mut H) {
35+
self.lib_url.hash(state);
36+
self.serialize_path_as_empty.hash(state);
37+
// no need to hash `serialized` as it's derived from the other two fields
38+
}
2439
}
2540

2641
impl PyUrl {
27-
pub fn new(lib_url: Url) -> Self {
28-
Self { lib_url }
42+
pub fn new(lib_url: Url, serialize_path_as_empty: bool) -> Self {
43+
Self {
44+
lib_url,
45+
serialize_path_as_empty,
46+
serialized: OnceLock::new(),
47+
}
2948
}
3049

3150
pub fn url(&self) -> &Url {
3251
&self.lib_url
3352
}
34-
}
3553

36-
fn build_schema_validator(py: Python, schema_type: &str) -> SchemaValidator {
37-
let schema = PyDict::new(py);
38-
schema.set_item("type", schema_type).unwrap();
39-
SchemaValidator::py_new(py, &schema, None).unwrap()
54+
pub fn url_mut(&mut self) -> &mut Url {
55+
&mut self.lib_url
56+
}
57+
58+
pub fn serialized(&self, py: Python<'_>) -> &str {
59+
if self.serialize_path_as_empty {
60+
self.serialized
61+
.get_or_init_py_attached(py, || serialize_url_without_path_slash(&self.lib_url))
62+
} else {
63+
self.lib_url.as_str()
64+
}
65+
}
4066
}
4167

4268
#[pymethods]
4369
impl PyUrl {
4470
#[new]
45-
pub fn py_new(py: Python, url: &Bound<'_, PyAny>) -> PyResult<Self> {
46-
let schema_obj = SCHEMA_DEFINITION_URL
47-
.get_or_init(py, || build_schema_validator(py, "url"))
48-
.validate_python(py, url, None, None, None, None, None, false.into(), None, None)?;
71+
#[pyo3(signature = (url, *, preserve_empty_path=false))]
72+
pub fn py_new(py: Python, url: &Bound<'_, PyAny>, preserve_empty_path: bool) -> PyResult<Self> {
73+
let schema_obj = get_schema_validator(py, false, preserve_empty_path)?.validate_python(
74+
py,
75+
url,
76+
None,
77+
None,
78+
None,
79+
None,
80+
None,
81+
false.into(),
82+
None,
83+
None,
84+
)?;
4985
schema_obj.extract(py)
5086
}
5187

@@ -117,12 +153,12 @@ impl PyUrl {
117153
unicode_url(&self.lib_url)
118154
}
119155

120-
pub fn __str__(&self) -> &str {
121-
self.lib_url.as_str()
156+
pub fn __str__(&self, py: Python<'_>) -> &str {
157+
dbg!(self.serialized(py))
122158
}
123159

124-
pub fn __repr__(&self) -> String {
125-
format!("Url('{}')", self.lib_url)
160+
pub fn __repr__(&self, py: Python<'_>) -> String {
161+
format!("Url('{}')", self.serialized(py))
126162
}
127163

128164
fn __richcmp__(&self, other: &Self, op: CompareOp) -> PyResult<bool> {
@@ -151,8 +187,8 @@ impl PyUrl {
151187
self.clone().into_py_any(py)
152188
}
153189

154-
fn __getnewargs__(&self) -> (&str,) {
155-
(self.__str__(),)
190+
fn __getnewargs__(&self, py: Python<'_>) -> (&str,) {
191+
(self.__str__(py),)
156192
}
157193

158194
#[classmethod]
@@ -201,11 +237,8 @@ pub struct PyMultiHostUrl {
201237
}
202238

203239
impl PyMultiHostUrl {
204-
pub fn new(ref_url: Url, extra_urls: Option<Vec<Url>>) -> Self {
205-
Self {
206-
ref_url: PyUrl::new(ref_url),
207-
extra_urls,
208-
}
240+
pub fn new(ref_url: PyUrl, extra_urls: Option<Vec<Url>>) -> Self {
241+
Self { ref_url, extra_urls }
209242
}
210243

211244
pub fn lib_url(&self) -> &Url {
@@ -222,10 +255,20 @@ static SCHEMA_DEFINITION_MULTI_HOST_URL: GILOnceCell<SchemaValidator> = GILOnceC
222255
#[pymethods]
223256
impl PyMultiHostUrl {
224257
#[new]
225-
pub fn py_new(py: Python, url: &Bound<'_, PyAny>) -> PyResult<Self> {
226-
let schema_obj = SCHEMA_DEFINITION_MULTI_HOST_URL
227-
.get_or_init(py, || build_schema_validator(py, "multi-host-url"))
228-
.validate_python(py, url, None, None, None, None, None, false.into(), None, None)?;
258+
#[pyo3(signature = (url, *, preserve_empty_path=false))]
259+
pub fn py_new(py: Python, url: &Bound<'_, PyAny>, preserve_empty_path: bool) -> PyResult<Self> {
260+
let schema_obj = get_schema_validator(py, true, preserve_empty_path)?.validate_python(
261+
py,
262+
url,
263+
None,
264+
None,
265+
None,
266+
None,
267+
None,
268+
false.into(),
269+
None,
270+
None,
271+
)?;
229272
schema_obj.extract(py)
230273
}
231274

@@ -297,7 +340,7 @@ impl PyMultiHostUrl {
297340
}
298341
}
299342

300-
pub fn __str__(&self) -> String {
343+
pub fn __str__(&self, py: Python<'_>) -> String {
301344
if let Some(extra_urls) = &self.extra_urls {
302345
let scheme = self.ref_url.lib_url.scheme();
303346
let host_offset = scheme.len() + 3;
@@ -321,12 +364,12 @@ impl PyMultiHostUrl {
321364
full_url.insert_str(host_offset, &hosts);
322365
full_url
323366
} else {
324-
self.ref_url.__str__().to_string()
367+
self.ref_url.__str__(py).to_string()
325368
}
326369
}
327370

328-
pub fn __repr__(&self) -> String {
329-
format!("MultiHostUrl('{}')", self.__str__())
371+
pub fn __repr__(&self, py: Python<'_>) -> String {
372+
format!("MultiHostUrl('{}')", self.__str__(py))
330373
}
331374

332375
fn __richcmp__(&self, other: &Self, op: CompareOp) -> PyResult<bool> {
@@ -354,8 +397,8 @@ impl PyMultiHostUrl {
354397
self.clone().into_py_any(py)
355398
}
356399

357-
fn __getnewargs__(&self) -> (String,) {
358-
(self.__str__(),)
400+
fn __getnewargs__(&self, py: Python<'_>) -> (String,) {
401+
(self.__str__(py),)
359402
}
360403

361404
#[classmethod]
@@ -517,3 +560,56 @@ fn is_punnycode_domain(lib_url: &Url, domain: &str) -> bool {
517560
pub fn scheme_is_special(scheme: &str) -> bool {
518561
matches!(scheme, "http" | "https" | "ws" | "wss" | "ftp" | "file")
519562
}
563+
564+
fn serialize_url_without_path_slash(url: &Url) -> String {
565+
// use pointer arithmetic to find the pieces we need to build the string
566+
let s = url.as_str();
567+
let path = url.path();
568+
assert_eq!(
569+
path, "/",
570+
"`serialize_path_as_empty` expected to be set only when path is '/'"
571+
);
572+
573+
assert!(
574+
// Safety for the below: `s` and `path` should be from the same text slice, so
575+
// we can pull out the slices of `s` that don't include `path`.
576+
s.as_ptr() <= path.as_ptr() && unsafe { s.as_ptr().add(s.len()) } >= unsafe { path.as_ptr().add(path.len()) }
577+
);
578+
579+
let prefix_len = path.as_ptr() as usize - s.as_ptr() as usize;
580+
let suffix_len = s.len() - (prefix_len + path.len());
581+
582+
// Safety: prefix is the slice of `s` leading to `path`, protected by the assert above.
583+
let prefix = unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(s.as_ptr(), prefix_len)) };
584+
// Safety: suffix is the slice of `s` after `path`, protected by the assert above.
585+
let suffix =
586+
unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(path.as_ptr().add(path.len()), suffix_len)) };
587+
588+
format!("{prefix}{suffix}")
589+
}
590+
591+
static SCHEMA_URL_SINGLE_TRUE: GILOnceCell<SchemaValidator> = GILOnceCell::new();
592+
static SCHEMA_URL_SINGLE_FALSE: GILOnceCell<SchemaValidator> = GILOnceCell::new();
593+
static SCHEMA_URL_MULTI_TRUE: GILOnceCell<SchemaValidator> = GILOnceCell::new();
594+
static SCHEMA_URL_MULTI_FALSE: GILOnceCell<SchemaValidator> = GILOnceCell::new();
595+
596+
// Expands to an expression that builds a `SchemaValidator` from a minimal schema dict
// containing the given `type` and, when requested, `preserve_empty_path`.
//
// The expansion uses `?`, so it has to be placed in a context that returns a compatible
// `Result` (here: the initialiser closures of `get_schema_validator`).
macro_rules! make_schema_val {
    ($py:ident, $schema_type:literal, $preserve_empty_path:literal) => {{
        let schema_dict = PyDict::new($py);
        schema_dict.set_item(intern!($py, "type"), intern!($py, $schema_type))?;
        // `preserve_empty_path` is false by default, so the key is written only for `true`
        if $preserve_empty_path {
            schema_dict.set_item(intern!($py, "preserve_empty_path"), true)?;
        }
        SchemaValidator::py_new($py, &schema_dict, None)
    }};
}
607+
608+
fn get_schema_validator(py: Python<'_>, multi_host: bool, preserve_empty_path: bool) -> PyResult<&SchemaValidator> {
609+
match (multi_host, preserve_empty_path) {
610+
(false, true) => SCHEMA_URL_SINGLE_TRUE.get_or_try_init(py, || make_schema_val!(py, "url", true)),
611+
(false, false) => SCHEMA_URL_SINGLE_FALSE.get_or_try_init(py, || make_schema_val!(py, "url", false)),
612+
(true, true) => SCHEMA_URL_MULTI_TRUE.get_or_try_init(py, || make_schema_val!(py, "multi-host-url", true)),
613+
(true, false) => SCHEMA_URL_MULTI_FALSE.get_or_try_init(py, || make_schema_val!(py, "multi-host-url", false)),
614+
}
615+
}

0 commit comments

Comments
 (0)