-
Notifications
You must be signed in to change notification settings - Fork 132
Fixing and refactoring parse_dataset_uri()
#1352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import builtins | ||
| import json | ||
| import re | ||
| from dataclasses import dataclass, fields | ||
| from datetime import datetime | ||
| from functools import cached_property | ||
|
|
@@ -10,7 +11,6 @@ | |
| TypeVar, | ||
| Union, | ||
| ) | ||
| from urllib.parse import urlparse | ||
|
|
||
| from packaging.specifiers import SpecifierSet | ||
| from packaging.version import Version | ||
|
|
@@ -43,25 +43,34 @@ | |
| StorageURI = NewType("StorageURI", str) | ||
|
|
||
|
|
||
| def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]: | ||
| def parse_dataset_uri( | ||
| uri: str, | ||
| ) -> tuple[Optional[str], Optional[str], str, Optional[str]]: | ||
| """ | ||
| Parse dataser uri to extract name and version out of it (if version is defined) | ||
| Example: | ||
| Input: ds://[email protected] | ||
| Output: (zalando, 3.0.1) | ||
| Parse a dataset URI of the form: | ||
sourcery-ai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| ds://[<namespace>.][<project>.]<name>[@v<semver>] | ||
|
|
||
| Returns: | ||
| (namespace, project, name, version) | ||
| """ | ||
| p = urlparse(uri) | ||
| if p.scheme != "ds": | ||
| raise Exception("Dataset uri should start with ds://") | ||
| s = p.netloc.split("@v") | ||
| name = s[0] | ||
| if len(s) == 1: | ||
| return name, None | ||
| if len(s) != 2: | ||
| raise Exception( | ||
| "Wrong dataset uri format, it should be: ds://<name>@v<version>" | ||
| ) | ||
| return name, s[1] | ||
|
|
||
| if not uri.startswith("ds://"): | ||
| raise ValueError(f"Invalid dataset URI: {uri}") | ||
|
|
||
| body = uri[len("ds://") :] | ||
|
|
||
| # Split off optional @v<version> | ||
| match = re.match(r"^(?P<name>.+?)(?:@v(?P<version>\d+\.\d+\.\d+))?$", body) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We still have quite a lot of cases where this regexp will not works (and we have no tests for all these cases), I would prefer to move parsing dataset name, version, namespace, project, etc to separate module and cover it all with tests, so we can reuse it all over the code, but same time it looks like a separate task to refactor this part and it is good enough for now for me. |
||
| if not match: | ||
| raise ValueError(f"Invalid dataset URI format: {uri}") | ||
|
|
||
| dataset_name = match.group("name") | ||
| version = match.group("version") | ||
|
|
||
| namespace, project, name = parse_dataset_name(dataset_name) | ||
|
|
||
| return namespace, project, name, version | ||
|
|
||
|
|
||
| def create_dataset_uri( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| DatasetRecord, | ||
| DatasetVersion, | ||
| parse_dataset_name, | ||
| parse_dataset_uri, | ||
| ) | ||
| from datachain.error import InvalidDatasetNameError | ||
| from datachain.sql.types import ( | ||
|
|
@@ -180,3 +181,24 @@ def test_parse_dataset_name(full_name, namespace, project, name): | |
| def test_parse_dataset_name_empty_name(): | ||
| with pytest.raises(InvalidDatasetNameError): | ||
| assert parse_dataset_name(None) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "uri,namespace,project,name,version", | ||
| [ | ||
| ("ds://result", None, None, "result", None), | ||
| ("ds://[email protected]", None, None, "result", "1.0.5"), | ||
| ("ds://dev.result", None, "dev", "result", None), | ||
| ("ds://[email protected]", None, "dev", "result", "1.0.5"), | ||
| ("ds://global.dev.result", "global", "dev", "result", None), | ||
| ("ds://[email protected]", "global", "dev", "result", "1.0.5"), | ||
| ("ds://@ilongin.dev.result", "@ilongin", "dev", "result", None), | ||
| ("ds://@[email protected]", "@ilongin", "dev", "result", "1.0.4"), | ||
| ("ds://@vlad.dev.result", "@vlad", "dev", "result", None), | ||
| ("ds://@[email protected]", "@vlad", "dev", "result", "1.0.5"), | ||
| ("ds://@[email protected]@v1.0.5", "@vlad", "@vlad", "result", "1.0.5"), | ||
| ("ds://@vlad.@vlad.@[email protected]", "@vlad", "@vlad", "@vlad", "1.0.5"), | ||
| ], | ||
| ) | ||
| def test_parse_dataset_uri(uri, namespace, project, name, version): | ||
| assert parse_dataset_uri(uri) == (namespace, project, name, version) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: Exception handling could be more specific than a generic Exception.
Catching Exception may hide unrelated errors. Catch ValueError or a custom exception from parse_dataset_uri instead.