Skip to content
Draft
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ tests = [
"pytest-mock >=3.12.0",
"pylint >=2.17.4",
"mypy >=1.10.0",
"pydantic >=2",
"pydantic >=2,<3", # <3 required for testing pydantic v1 support, not for actual use
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not predict the future. It's better if these tests break once pydantic 3 actually breaks the API we would rely on in here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this is fair - I can remove the pin but might keep the comment such that when tests inevitably fail when pydantic v3 is released there is something to note why this might be the case? Does that sound suitable?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI here is the version policy indicating the intention to remove pydantic.v1 from v3 onwards.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI here is the version policy indicating the intention to remove pydantic.v1 from v3 onwards.

I am not sure I follow. Where does it say in there that pydantic.v1 won't be available in v3?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah sorry good point - I might have misread there - but here is the comment from Sam Colvin about intended removal of the V1 shim.

"pytest-mypy-plugins >=3.1.2",
"packaging",
]
Expand Down
68 changes: 58 additions & 10 deletions upath/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import os
import pathlib
import sys
import warnings
from abc import ABCMeta
Expand All @@ -14,8 +15,10 @@
from typing import TYPE_CHECKING
from typing import Any
from typing import BinaryIO
from typing import Callable
from typing import Literal
from typing import TextIO
from typing import TypedDict
from typing import overload
from urllib.parse import SplitResult
from urllib.parse import urlsplit
Expand All @@ -40,8 +43,10 @@

if TYPE_CHECKING:
if sys.version_info >= (3, 11):
from typing import NotRequired
from typing import Self
else:
from typing_extensions import NotRequired
from typing_extensions import Self

from pydantic import GetCoreSchemaHandler
Expand Down Expand Up @@ -107,6 +112,14 @@ def __getitem__(cls, key):
return cls


class SerializedUPath(TypedDict):
    """Serialized format for a UPath object"""

    # fsspec-compatible path with the protocol stripped (see UPath.path)
    path: str
    # fsspec protocol of the path; may be omitted
    protocol: NotRequired[str]
    # filesystem-specific options forwarded to fsspec; may be omitted
    storage_options: NotRequired[dict[str, Any]]


class _UPathMixin(metaclass=_UPathMeta):
__slots__ = ()

Expand Down Expand Up @@ -179,6 +192,13 @@ def path(self) -> str:
"""The path that a fsspec filesystem can use."""
return self.parser.strip_protocol(self.__str__())

def to_dict(self) -> SerializedUPath:
    """Serialize this path into a plain ``SerializedUPath`` dict.

    Returns a mapping of the fsspec-compatible ``path``, the
    ``protocol``, and a shallow copy of ``storage_options``
    (copied so callers cannot mutate this instance's options).
    """
    serialized = dict(
        path=self.path,
        protocol=self.protocol,
        storage_options=dict(self.storage_options),
    )
    return serialized

Comment on lines +195 to +201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is to_dict() needed? It does not seem to be a standard v1 BaseModel method.

Let's not extend the public UPath class API for pydantic v1 support

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah thats fair enough. I wanted to include a convenience method for enabling JSON serialization in pydantic v1 models. At present, there would not be a way to serialize this class that a V1 model would utilize - for example doing:

from pydantic.v1 import BaseModel

class Model(BaseModel):

    path: UPath


model = Model(path="s3://bucket/path/to/key/")

model.json() # hits TypeError: Object of type 'S3Path' is not JSON serializable

whereas if I do:

class Model(BaseModel):
    path: UPath
        class Config:
            json_encoders = {UPath: UPath.to_dict}

model = Model(path="s3://bucket/path/to/key/")
model.json() # == '{"path": {"path": "bucket/path/to/key", "protocol": "s3", "storage_options": {}}}'

So its mainly a way to expose an easier serialization method that can be used with v1 models - unfortunately the serialization format for v2 is not used with v1 models (and you can't define a hook method on the custom class to provide a standardized serialization method for the class 😭, its solely the responsibility of the v1 model that uses the custom class), so I thought it might be a useful idea to include a serialization method to the class to provide an easier method to use as a json encoder hook. Totally okay with removing it though if you think this is too much over-reach of this PR.

Copy link
Collaborator

@ap-- ap-- Aug 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm...

We could introduce a general serializer helper function in universal-pathlib, that could then be used.

In principle this would be:

serialize_upath = upath.UPath.__get_pydantic_core_schema__(None, **None)['serialization']['function']

Thinking about this, that might be the best idea. Something similar will land with chaining support: https://github.com/fsspec/universal_pathlib/pull/346/files#diff-79e86edc0bf259fa57cede9d23833d37576186abbdfe51591786da661c2620dfR28-R31

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but wouldn't this enforce that the serialization method requires installation of pydantic >= 2 to execute? Since the .__get_pydantic_core_schema__ imports from a pydantic v2 import path to generate the schema first.

I agree I would like to_dict to be executed akin to TypeAdapter(UPath).dump_python(upath, mode="json") but didn't want to enforce installation of pydantic v2 just for dictifying the a UPath (especially since in the tests there are scenarios where JSON serialization would fail anyway if some storage_options are not serializable anyway, so figured it probably didn't make sense / matter to exactly use the TypeAdapter pattern (it would just be softly asking for an additional dependency to be added [softly because the pydantic v2 import would be within the serialization function, or in __get_pydantic_core_schema__ etc]).

But perhaps thats not a big deal - if you want to serialize a UPath - you need pydantic>=2 🤷

def joinuri(self, uri: JoinablePathLike) -> UPath:
"""Join with urljoin behavior for UPath instances"""
# short circuit if the new uri uses a different protocol
Expand Down Expand Up @@ -946,9 +966,7 @@ def __get_pydantic_core_schema__(

deserialization_schema = core_schema.chain_schema(
[
core_schema.no_info_plain_validator_function(
lambda v: {"path": v} if isinstance(v, str) else v,
),
core_schema.no_info_plain_validator_function(cls._to_serialized_format),
core_schema.typed_dict_schema(
{
"path": core_schema.typed_dict_field(
Expand All @@ -973,13 +991,7 @@ def __get_pydantic_core_schema__(
},
extra_behavior="forbid",
),
core_schema.no_info_plain_validator_function(
lambda dct: cls(
dct.pop("path"),
protocol=dct.pop("protocol"),
**dct["storage_options"],
)
),
core_schema.no_info_plain_validator_function(cls._validate),
]
)

Expand All @@ -998,3 +1010,39 @@ def __get_pydantic_core_schema__(
),
serialization=serialization_schema,
)

@classmethod
def __get_validators__(cls) -> Iterator[Callable]:
    """Pydantic v1 compatibility hook.

    Pydantic v1 iterates this to collect validator callables; all
    input coercion is delegated to the shared ``_validate`` method.
    """
    return iter((cls._validate,))
Comment on lines +1014 to +1016
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could support for v1 be added by only introducing __get_validators__ ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mentioned in a comment below but linked again for good measure - ultimately __get_validators__ requires yielding callable validation methods / functions that are used during validation - so we need to yield a callable still.


@staticmethod
def _to_serialized_format(
    v: str | pathlib.Path | _UPathMixin | dict[str, Any]
) -> SerializedUPath:
    """Coerce a supported input value into the ``SerializedUPath`` form.

    Accepts an existing UPath(-mixin) instance, an already-serialized
    dict, a ``pathlib.Path``, or a plain path string.

    Raises:
        TypeError: if ``v`` is of an unsupported type, or is a dict
            that lacks the required ``"path"`` key. TypeError (rather
            than KeyError) is used so pydantic v1 reports it as a
            validation error instead of crashing.
    """
    if isinstance(v, _UPathMixin):
        return v.to_dict()
    if isinstance(v, dict):
        if "path" not in v:
            # Previously raised an uncaught KeyError for malformed dicts.
            raise TypeError(f"Invalid path: {v!r}")
        return {
            "path": v["path"],
            "protocol": v.get("protocol", ""),
            "storage_options": v.get("storage_options", {}),
        }
    if isinstance(v, pathlib.Path):
        # Use the posix form so the path round-trips across platforms.
        return {"path": v.as_posix(), "protocol": ""}
    if isinstance(v, str):
        return {
            "path": v,
        }
    raise TypeError(f"Invalid path: {v!r}")

@classmethod
def _validate(cls, v: Any) -> UPath:
    """Validate ``v`` into a UPath instance.

    Shared validator for both pydantic v1 (``__get_validators__``) and
    v2 (``__get_pydantic_core_schema__``) entry points.

    An existing UPath passes through unchanged (identity preserved, as
    asserted by ``test_validate_from_instance``); any other supported
    input is first coerced to the serialized dict form and used to
    construct a new instance.
    """
    if isinstance(v, UPath):
        # Fix: the previous version fell through and subscripted the
        # instance as a dict; the trailing ``return v`` was unreachable.
        return v
    dct = cls._to_serialized_format(v)
    return cls(
        dct["path"],
        protocol=dct.get("protocol"),
        **dct.get("storage_options", {}),  # type: ignore[arg-type]
    )
Comment on lines +1038 to +1048
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this classmethod needed specifically?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what is actually executed during v1 validation - I added this as a function such that now both v1 and v2 actually run this method when executing validation. I believe the v2 validation used lambda's beforehand depending on the input type within __get_pydantic_core_schema__, whereas this is a single source of truth for both v1 and v2 validation with this PR which imo is cleaner than implementing the validation procedure twice for the different versions.

LMK what you think though - I can change it back to having two separate procedures for v1 and v2 if need be.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me rephrase: Why does this have to be attached to the class interface. I.e. this could be a separate helper function for v1 support living outside of the UPath class.

I can change it back to having two separate procedures for v1 and v2 if need be.

I'd prefer that. It'll make maintenance easier down the line, when v1 support is dropped eventually.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I guess my point is that encapsulating this logic within one method that is used by both v1 (__get_validators__) and v2 (__get_pydantic_core_schema__) means that this logic is not necessarily just 'v1' logic. The only 'v1' logic that would be attached to the class is the __get_validators__ method ultimately. I believe having this validation entirely separate will make for more of a maintenance burden / require more code to be removed if v1 support is dropped.

But okay, so you would prefer having a separate function that is not a classmethod that undertakes this validation and have it separate to the v2 validation?

89 changes: 64 additions & 25 deletions upath/tests/test_pydantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,39 @@
from os.path import abspath

import pydantic
import pydantic.v1 as pydantic_v1
import pydantic_core
import pytest
from fsspec.implementations.http import get_client

from upath import UPath


@pytest.fixture(params=["v1", "v2"])
def pydantic_version(request):
    # Parametrize tests over the pydantic v1 compat shim and the native v2 API.
    return request.param


@pytest.fixture(params=["json", "python"])
def source(request):
    # Input format under test: a raw JSON string or a python object.
    return request.param


@pytest.fixture
def parser(pydantic_version, source):
    """Return a callable parsing the given input into a UPath.

    Dispatches to ``pydantic.v1.tools`` or a v2 ``TypeAdapter``
    depending on the ``pydantic_version`` fixture, and to the
    JSON/python entry point depending on the ``source`` fixture.
    """
    if pydantic_version == "v1":
        parse = (
            pydantic_v1.tools.parse_raw_as
            if source == "json"
            else pydantic_v1.tools.parse_obj_as
        )
        return lambda x: parse(UPath, x)
    adapter = pydantic.TypeAdapter(UPath)
    return adapter.validate_json if source == "json" else adapter.validate_python


@pytest.mark.parametrize(
"path",
[
Expand All @@ -19,15 +45,13 @@
"https://www.example.com",
],
)
@pytest.mark.parametrize("source", ["json", "python"])
def test_validate_from_str(path, source):
def test_validate_from_str(path, source, parser):
expected = UPath(path)

ta = pydantic.TypeAdapter(UPath)
if source == "json":
actual = ta.validate_json(json.dumps(path))
else: # source == "python"
actual = ta.validate_python(path)
path = json.dumps(path)

actual = parser(path)

assert abspath(actual.path) == abspath(expected.path)
assert actual.protocol == expected.protocol
Expand All @@ -43,13 +67,13 @@ def test_validate_from_str(path, source):
}
],
)
@pytest.mark.parametrize("source", ["json", "python"])
def test_validate_from_dict(dct, source):
ta = pydantic.TypeAdapter(UPath)
def test_validate_from_dict(dct, source, parser):
if source == "json":
output = ta.validate_json(json.dumps(dct))
else: # source == "python"
output = ta.validate_python(dct)
data = json.dumps(dct)
else:
data = dct

output = parser(data)

assert abspath(output.path) == abspath(dct["path"])
assert output.protocol == dct["protocol"]
Expand All @@ -66,10 +90,13 @@ def test_validate_from_dict(dct, source):
"https://www.example.com",
],
)
def test_validate_from_instance(path):
def test_validate_from_instance(path, pydantic_version):
input = UPath(path)

output = pydantic.TypeAdapter(UPath).validate_python(input)
if pydantic_version == "v1":
output = pydantic_v1.tools.parse_obj_as(UPath, input)
else:
output = pydantic.TypeAdapter(UPath).validate_python(input)

assert output is input

Expand All @@ -88,26 +115,38 @@ def test_validate_from_instance(path):
],
)
@pytest.mark.parametrize("mode", ["json", "python"])
def test_dump(args, kwargs, mode):
def test_dump(args, kwargs, mode, pydantic_version):
u = UPath(*args, **kwargs)

output = pydantic.TypeAdapter(UPath).dump_python(u, mode=mode)
if pydantic_version == "v1":
output = u.to_dict()
else:
output = pydantic.TypeAdapter(UPath).dump_python(u, mode=mode)

assert output["path"] == u.path
assert output["protocol"] == u.protocol
assert output["storage_options"] == u.storage_options


def test_dump_non_serializable_python():
output = pydantic.TypeAdapter(UPath).dump_python(
UPath("https://www.example.com", get_client=get_client), mode="python"
)
def test_dump_non_serializable_python(pydantic_version):
    """Python-mode dumps keep non-JSON-serializable storage options intact."""
    path = UPath("https://www.example.com", get_client=get_client)

    if pydantic_version != "v1":
        dumped = pydantic.TypeAdapter(UPath).dump_python(path, mode="python")
    else:
        dumped = path.to_dict()

    # The callable must survive serialization by identity, not by copy.
    assert dumped["storage_options"]["get_client"] is get_client


def test_dump_non_serializable_json():
with pytest.raises(pydantic_core.PydanticSerializationError, match="unknown type"):
pydantic.TypeAdapter(UPath).dump_python(
UPath("https://www.example.com", get_client=get_client), mode="json"
)
def test_dump_non_serializable_json(pydantic_version):
    """JSON-mode serialization must fail on non-serializable storage options."""
    path = UPath("https://www.example.com", get_client=get_client)

    if pydantic_version == "v1":
        # v1 has no JSON dump mode; serializing to_dict() output directly
        # hits the stdlib json encoder error.
        with pytest.raises(TypeError, match="not JSON serializable"):
            json.dumps(path.to_dict())
        return
    with pytest.raises(
        pydantic_core.PydanticSerializationError, match="unknown type"
    ):
        pydantic.TypeAdapter(UPath).dump_python(path, mode="json")
Loading