Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 13 additions & 37 deletions src/datachain/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,58 +43,34 @@
StorageURI = NewType("StorageURI", str)


def parse_dataset_uri(uri: str) -> tuple[str, str, str, str | None]:
def parse_dataset_uri(
uri: str,
) -> tuple[Optional[str], Optional[str], str, Optional[str]]:
"""
Parse a dataset URI of the form:
ds://<namespace>.<project>.<name>[@v<semver>]
Components:
- `ds://` : required prefix identifying dataset URIs.
- `namespace` : required namespace, may start with '@' (e.g., "@user").
- `project` : required project name inside the namespace.
- `name` : required dataset name.
- `@v<semver>` : optional version suffix. Must start with '@v' and
be a semantic version string MAJOR.MINOR.PATCH
(e.g., "1.0.4").
ds://[<namespace>.][<project>.]<name>[@v<semver>]
Returns:
tuple[str, str, str, str | None]:
(namespace, project, name, version) where version is None
if not provided.
Raises:
ValueError: if the URI does not start with 'ds://' or does not
match the expected format.
(namespace, project, name, version)
"""

if not uri.startswith("ds://"):
raise ValueError(f"Invalid dataset URI: {uri}")

body = uri[len("ds://") :]

pattern = re.compile(
r"""
^(?P<namespace>@?\w+) # namespace, may start with '@'
\. (?P<project>\w+) # project
\. (?P<name>\w+) # dataset name
(?:@v # optional version prefix must be '@v'
(?P<version>\d+\.\d+\.\d+)
)?$ # end of string
""",
re.VERBOSE,
)

match = pattern.match(body)
# Split off optional @v<version>
match = re.match(r"^(?P<name>.+?)(?:@v(?P<version>\d+\.\d+\.\d+))?$", body)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still have quite a lot of cases where this regexp will not works (and we have no tests for all these cases), I would prefer to move parsing dataset name, version, namespace, project, etc to separate module and cover it all with tests, so we can reuse it all over the code, but same time it looks like a separate task to refactor this part and it is good enough for now for me.

if not match:
raise ValueError(f"Invalid dataset URI format: {uri}")

return (
match.group("namespace"),
match.group("project"),
match.group("name"),
match.group("version"),
)
dataset_name = match.group("name")
version = match.group("version")

namespace, project, name = parse_dataset_name(dataset_name)

return namespace, project, name, version


def create_dataset_uri(
Expand Down
22 changes: 6 additions & 16 deletions tests/unit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,29 +186,19 @@ def test_parse_dataset_name_empty_name():
@pytest.mark.parametrize(
"uri,namespace,project,name,version",
[
("ds://result", None, None, "result", None),
("ds://[email protected]", None, None, "result", "1.0.5"),
("ds://dev.result", None, "dev", "result", None),
("ds://[email protected]", None, "dev", "result", "1.0.5"),
("ds://global.dev.result", "global", "dev", "result", None),
("ds://[email protected]", "global", "dev", "result", "1.0.5"),
("ds://@ilongin.dev.result", "@ilongin", "dev", "result", None),
("ds://@[email protected]", "@ilongin", "dev", "result", "1.0.4"),
("ds://@vlad.dev.result", "@vlad", "dev", "result", None),
("ds://@[email protected]", "@vlad", "dev", "result", "1.0.5"),
("ds://@[email protected]@v1.0.5", "@vlad", "@vlad", "result", "1.0.5"),
("ds://@vlad.@vlad.@[email protected]", "@vlad", "@vlad", "@vlad", "1.0.5"),
],
)
def test_parse_dataset_uri(uri, namespace, project, name, version):
assert parse_dataset_uri(uri) == (namespace, project, name, version)


@pytest.mark.parametrize(
"uri",
[
"ds://result",
"ds://[email protected]",
"ds://@[email protected]",
"ds://@[email protected]",
],
)
def test_parse_dataset_uri_invalid_format(uri):
with pytest.raises(ValueError) as excinfo:
parse_dataset_uri(uri)

assert str(excinfo.value) == f"Invalid dataset URI format: {uri}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why did we remove these tests to catch all the failures and left only "happy path" tests :(

Loading