Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions src/datachain/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,10 +681,7 @@ def _row_to_node(d: dict[str, Any]) -> Node:
for src in sources: # Opt: parallel
listing: Optional[Listing]
if src.startswith("ds://"):
ds_name, ds_version = parse_dataset_uri(src)
ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
assert ds_namespace
assert ds_project
(ds_namespace, ds_project, ds_name, ds_version) = parse_dataset_uri(src)
dataset = self.get_dataset(
ds_name, namespace_name=ds_namespace, project_name=ds_project
)
Expand Down Expand Up @@ -1515,13 +1512,12 @@ def _instantiate(ds_uri: str) -> None:
studio_client = StudioClient()

try:
remote_ds_name, version = parse_dataset_uri(remote_ds_uri)
(remote_namespace, remote_project, remote_ds_name, version) = (
parse_dataset_uri(remote_ds_uri)
)
except Exception as e:
raise DataChainError("Error when parsing dataset uri") from e
Comment on lines 1514 to 1519
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Exception handling could be more specific than a generic Exception.

Catching Exception may hide unrelated errors. Catch ValueError or a custom exception from parse_dataset_uri instead.

Suggested change
try:
remote_ds_name, version = parse_dataset_uri(remote_ds_uri)
(remote_namespace, remote_project, remote_ds_name, version) = (
parse_dataset_uri(remote_ds_uri)
)
except Exception as e:
raise DataChainError("Error when parsing dataset uri") from e
try:
(remote_namespace, remote_project, remote_ds_name, version) = (
parse_dataset_uri(remote_ds_uri)
)
except ValueError as e:
raise DataChainError("Error when parsing dataset uri") from e


remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
remote_ds_name
)
if not remote_namespace or not remote_project:
raise DataChainError(
f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
Expand Down
69 changes: 51 additions & 18 deletions src/datachain/dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import builtins
import json
import re
from dataclasses import dataclass, fields
from datetime import datetime
from functools import cached_property
Expand All @@ -10,7 +11,6 @@
TypeVar,
Union,
)
from urllib.parse import urlparse

from packaging.specifiers import SpecifierSet
from packaging.version import Version
Expand Down Expand Up @@ -43,25 +43,58 @@
StorageURI = NewType("StorageURI", str)


def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
def parse_dataset_uri(uri: str) -> tuple[str, str, str, str | None]:
"""
Parse dataser uri to extract name and version out of it (if version is defined)
Example:
Input: ds://[email protected]
Output: (zalando, 3.0.1)
Parse a dataset URI of the form:
ds://<namespace>.<project>.<name>[@v<semver>]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we always require both namespace and project now? It looks like breaking changes 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method was used in places where it was expected to have both, but you are right to make this more generic and leave it as optional.

Components:
- `ds://` : required prefix identifying dataset URIs.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we drop the ds:// by now? do you remember why it was needed at all?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That goes way back ... It was part of the requirements but I don't remember all the reasoning behind it. We can discuss separately as I don't want to remove anything like that in this PR

- `namespace` : required namespace, may start with '@' (e.g., "@user").
- `project` : required project name inside the namespace.
- `name` : required dataset name.
- `@v<semver>` : optional version suffix. Must start with '@v' and
be a semantic version string MAJOR.MINOR.PATCH
(e.g., "1.0.4").
Returns:
tuple[str, str, str, str | None]:
(namespace, project, name, version) where version is None
if not provided.
Raises:
ValueError: if the URI does not start with 'ds://' or does not
match the expected format.
"""
p = urlparse(uri)
if p.scheme != "ds":
raise Exception("Dataset uri should start with ds://")
s = p.netloc.split("@v")
name = s[0]
if len(s) == 1:
return name, None
if len(s) != 2:
raise Exception(
"Wrong dataset uri format, it should be: ds://<name>@v<version>"
)
return name, s[1]

if not uri.startswith("ds://"):
raise ValueError(f"Invalid dataset URI: {uri}")

body = uri[len("ds://") :]

pattern = re.compile(
r"""
^(?P<namespace>@?\w+) # namespace, may start with '@'
\. (?P<project>\w+) # project
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I bet we do allow @ in project name too. Let's check? 🙏

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

@ilongin ilongin Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ is not allowed in project name and dataset name. Also, you cannot create namespace name with it, but we do create automatically namespaces with @<username> so it can appear here. Regardless, I think I think you are right to assume it can be present, as it's safer that way.

\. (?P<name>\w+) # dataset name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't, those are reserved characters that are not allowed

(?:@v # optional version prefix must be '@v'
(?P<version>\d+\.\d+\.\d+)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I am not mistaken, we still do support single-digit versions (not semver) 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only allow integer versions in dc.read_dataset() to be backward compatible with old user's scripts. In that method we convert integer version to correct string semver and continue with that.
parse_dataset_uri() and other internal methods expect full valid semver.

)?$ # end of string
""",
re.VERBOSE,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of using regexp, here we can use parse_dataset_name first, to get optional namespace_name, optional project_name, and dataset_name (including version) and then parse dataset_name to split it into name and version 🤔

This way we will not split the logic of parsing full dataset name (including namespace and project) between two different methods (parse_dataset_uri and parse_dataset_name), and will keep it in one place, so next time we will need to change anything related, we will not forget to do so and it will be only one place where changes are required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactored to set namespace and project as optional and to use parse_dataset_name().


match = pattern.match(body)
if not match:
raise ValueError(f"Invalid dataset URI format: {uri}")

return (
match.group("namespace"),
match.group("project"),
match.group("name"),
match.group("version"),
)


def create_dataset_uri(
Expand Down
4 changes: 2 additions & 2 deletions tests/func/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,9 @@ def test_pull_dataset_wrong_version(

with pytest.raises(DataChainError) as exc_info:
catalog.pull_dataset(
f"ds://{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs@v5"
f"ds://{REMOTE_NAMESPACE_NAME}.{REMOTE_PROJECT_NAME}.dogs@v5.0.0"
)
assert str(exc_info.value) == "Dataset dogs doesn't have version 5 on server"
assert str(exc_info.value) == "Dataset dogs doesn't have version 5.0.0 on server"


@pytest.mark.parametrize("cloud_type, version_aware", [("s3", False)], indirect=True)
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DatasetRecord,
DatasetVersion,
parse_dataset_name,
parse_dataset_uri,
)
from datachain.error import InvalidDatasetNameError
from datachain.sql.types import (
Expand Down Expand Up @@ -180,3 +181,34 @@ def test_parse_dataset_name(full_name, namespace, project, name):
def test_parse_dataset_name_empty_name():
with pytest.raises(InvalidDatasetNameError):
assert parse_dataset_name(None)


@pytest.mark.parametrize(
"uri,namespace,project,name,version",
[
("ds://global.dev.result", "global", "dev", "result", None),
("ds://[email protected]", "global", "dev", "result", "1.0.5"),
("ds://@ilongin.dev.result", "@ilongin", "dev", "result", None),
("ds://@[email protected]", "@ilongin", "dev", "result", "1.0.4"),
("ds://@vlad.dev.result", "@vlad", "dev", "result", None),
("ds://@[email protected]", "@vlad", "dev", "result", "1.0.5"),
],
)
def test_parse_dataset_uri(uri, namespace, project, name, version):
assert parse_dataset_uri(uri) == (namespace, project, name, version)


@pytest.mark.parametrize(
"uri",
[
"ds://result",
"ds://[email protected]",
"ds://@[email protected]",
"ds://@[email protected]",
],
)
def test_parse_dataset_uri_invalid_format(uri):
with pytest.raises(ValueError) as excinfo:
parse_dataset_uri(uri)

assert str(excinfo.value) == f"Invalid dataset URI format: {uri}"
Loading