-
Notifications
You must be signed in to change notification settings - Fork 132
Fixing and refactoring parse_dataset_uri()
#1352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import builtins | ||
| import json | ||
| import re | ||
| from dataclasses import dataclass, fields | ||
| from datetime import datetime | ||
| from functools import cached_property | ||
|
|
@@ -10,7 +11,6 @@ | |
| TypeVar, | ||
| Union, | ||
| ) | ||
| from urllib.parse import urlparse | ||
|
|
||
| from packaging.specifiers import SpecifierSet | ||
| from packaging.version import Version | ||
|
|
@@ -43,25 +43,58 @@ | |
| StorageURI = NewType("StorageURI", str) | ||
|
|
||
|
|
||
| def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]: | ||
| def parse_dataset_uri(uri: str) -> tuple[str, str, str, str | None]: | ||
| """ | ||
| Parse dataser uri to extract name and version out of it (if version is defined) | ||
| Example: | ||
| Input: ds://[email protected] | ||
| Output: (zalando, 3.0.1) | ||
| Parse a dataset URI of the form: | ||
sourcery-ai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ds://<namespace>.<project>.<name>[@v<semver>] | ||
|
||
| Components: | ||
| - `ds://` : required prefix identifying dataset URIs. | ||
|
||
| - `namespace` : required namespace, may start with '@' (e.g., "@user"). | ||
| - `project` : required project name inside the namespace. | ||
| - `name` : required dataset name. | ||
| - `@v<semver>` : optional version suffix. Must start with '@v' and | ||
| be a semantic version string MAJOR.MINOR.PATCH | ||
| (e.g., "1.0.4"). | ||
| Returns: | ||
| tuple[str, str, str, str | None]: | ||
| (namespace, project, name, version) where version is None | ||
| if not provided. | ||
| Raises: | ||
| ValueError: if the URI does not start with 'ds://' or does not | ||
| match the expected format. | ||
| """ | ||
| p = urlparse(uri) | ||
| if p.scheme != "ds": | ||
| raise Exception("Dataset uri should start with ds://") | ||
| s = p.netloc.split("@v") | ||
| name = s[0] | ||
| if len(s) == 1: | ||
| return name, None | ||
| if len(s) != 2: | ||
| raise Exception( | ||
| "Wrong dataset uri format, it should be: ds://<name>@v<version>" | ||
| ) | ||
| return name, s[1] | ||
|
|
||
| if not uri.startswith("ds://"): | ||
| raise ValueError(f"Invalid dataset URI: {uri}") | ||
|
|
||
| body = uri[len("ds://") :] | ||
|
|
||
| pattern = re.compile( | ||
| r""" | ||
| ^(?P<namespace>@?\w+) # namespace, may start with '@' | ||
| \. (?P<project>\w+) # project | ||
|
||
| \. (?P<name>\w+) # dataset name | ||
|
||
| (?:@v # optional version prefix must be '@v' | ||
| (?P<version>\d+\.\d+\.\d+) | ||
|
||
| )?$ # end of string | ||
| """, | ||
| re.VERBOSE, | ||
| ) | ||
|
||
|
|
||
| match = pattern.match(body) | ||
| if not match: | ||
| raise ValueError(f"Invalid dataset URI format: {uri}") | ||
|
|
||
| return ( | ||
| match.group("namespace"), | ||
| match.group("project"), | ||
| match.group("name"), | ||
| match.group("version"), | ||
| ) | ||
|
|
||
|
|
||
| def create_dataset_uri( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| DatasetRecord, | ||
| DatasetVersion, | ||
| parse_dataset_name, | ||
| parse_dataset_uri, | ||
| ) | ||
| from datachain.error import InvalidDatasetNameError | ||
| from datachain.sql.types import ( | ||
|
|
@@ -180,3 +181,34 @@ def test_parse_dataset_name(full_name, namespace, project, name): | |
| def test_parse_dataset_name_empty_name(): | ||
| with pytest.raises(InvalidDatasetNameError): | ||
| assert parse_dataset_name(None) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "uri,namespace,project,name,version", | ||
| [ | ||
| ("ds://global.dev.result", "global", "dev", "result", None), | ||
| ("ds://[email protected]", "global", "dev", "result", "1.0.5"), | ||
| ("ds://@ilongin.dev.result", "@ilongin", "dev", "result", None), | ||
| ("ds://@[email protected]", "@ilongin", "dev", "result", "1.0.4"), | ||
| ("ds://@vlad.dev.result", "@vlad", "dev", "result", None), | ||
| ("ds://@[email protected]", "@vlad", "dev", "result", "1.0.5"), | ||
| ], | ||
| ) | ||
| def test_parse_dataset_uri(uri, namespace, project, name, version): | ||
| assert parse_dataset_uri(uri) == (namespace, project, name, version) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "uri", | ||
| [ | ||
| "ds://result", | ||
| "ds://[email protected]", | ||
| "ds://@[email protected]", | ||
| "ds://@[email protected]", | ||
| ], | ||
| ) | ||
| def test_parse_dataset_uri_invalid_format(uri): | ||
| with pytest.raises(ValueError) as excinfo: | ||
| parse_dataset_uri(uri) | ||
|
|
||
| assert str(excinfo.value) == f"Invalid dataset URI format: {uri}" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: Exception handling could be more specific than a generic Exception.
Catching Exception may hide unrelated errors. Catch ValueError or a custom exception from parse_dataset_uri instead.