-
Notifications
You must be signed in to change notification settings - Fork 132
Fixing and refactoring parse_dataset_uri()
#1352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,58 +43,34 @@ | |
| StorageURI = NewType("StorageURI", str) | ||
|
|
||
|
|
||
| def parse_dataset_uri(uri: str) -> tuple[str, str, str, str | None]: | ||
| def parse_dataset_uri( | ||
| uri: str, | ||
| ) -> tuple[Optional[str], Optional[str], str, Optional[str]]: | ||
| """ | ||
| Parse a dataset URI of the form: | ||
| ds://<namespace>.<project>.<name>[@v<semver>] | ||
| Components: | ||
| - `ds://` : required prefix identifying dataset URIs. | ||
| - `namespace` : required namespace, may start with '@' (e.g., "@user"). | ||
| - `project` : required project name inside the namespace. | ||
| - `name` : required dataset name. | ||
| - `@v<semver>` : optional version suffix. Must start with '@v' and | ||
| be a semantic version string MAJOR.MINOR.PATCH | ||
| (e.g., "1.0.4"). | ||
| ds://[<namespace>.][<project>.]<name>[@v<semver>] | ||
| Returns: | ||
| tuple[str, str, str, str | None]: | ||
| (namespace, project, name, version) where version is None | ||
| if not provided. | ||
| Raises: | ||
| ValueError: if the URI does not start with 'ds://' or does not | ||
| match the expected format. | ||
| (namespace, project, name, version) | ||
| """ | ||
|
|
||
| if not uri.startswith("ds://"): | ||
| raise ValueError(f"Invalid dataset URI: {uri}") | ||
|
|
||
| body = uri[len("ds://") :] | ||
|
|
||
| pattern = re.compile( | ||
| r""" | ||
| ^(?P<namespace>@?\w+) # namespace, may start with '@' | ||
| \. (?P<project>\w+) # project | ||
| \. (?P<name>\w+) # dataset name | ||
| (?:@v # optional version prefix must be '@v' | ||
| (?P<version>\d+\.\d+\.\d+) | ||
| )?$ # end of string | ||
| """, | ||
| re.VERBOSE, | ||
| ) | ||
|
|
||
| match = pattern.match(body) | ||
| # Split off optional @v<version> | ||
| match = re.match(r"^(?P<name>.+?)(?:@v(?P<version>\d+\.\d+\.\d+))?$", body) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We still have quite a lot of cases where this regexp will not works (and we have no tests for all these cases), I would prefer to move parsing dataset name, version, namespace, project, etc to separate module and cover it all with tests, so we can reuse it all over the code, but same time it looks like a separate task to refactor this part and it is good enough for now for me. |
||
| if not match: | ||
| raise ValueError(f"Invalid dataset URI format: {uri}") | ||
|
|
||
| return ( | ||
| match.group("namespace"), | ||
| match.group("project"), | ||
| match.group("name"), | ||
| match.group("version"), | ||
| ) | ||
| dataset_name = match.group("name") | ||
| version = match.group("version") | ||
|
|
||
| namespace, project, name = parse_dataset_name(dataset_name) | ||
|
|
||
| return namespace, project, name, version | ||
|
|
||
|
|
||
| def create_dataset_uri( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -186,29 +186,19 @@ def test_parse_dataset_name_empty_name(): | |
| @pytest.mark.parametrize( | ||
| "uri,namespace,project,name,version", | ||
| [ | ||
| ("ds://result", None, None, "result", None), | ||
| ("ds://[email protected]", None, None, "result", "1.0.5"), | ||
| ("ds://dev.result", None, "dev", "result", None), | ||
| ("ds://[email protected]", None, "dev", "result", "1.0.5"), | ||
| ("ds://global.dev.result", "global", "dev", "result", None), | ||
| ("ds://[email protected]", "global", "dev", "result", "1.0.5"), | ||
| ("ds://@ilongin.dev.result", "@ilongin", "dev", "result", None), | ||
| ("ds://@[email protected]", "@ilongin", "dev", "result", "1.0.4"), | ||
| ("ds://@vlad.dev.result", "@vlad", "dev", "result", None), | ||
| ("ds://@[email protected]", "@vlad", "dev", "result", "1.0.5"), | ||
| ("ds://@[email protected]@v1.0.5", "@vlad", "@vlad", "result", "1.0.5"), | ||
| ("ds://@vlad.@vlad.@[email protected]", "@vlad", "@vlad", "@vlad", "1.0.5"), | ||
| ], | ||
| ) | ||
| def test_parse_dataset_uri(uri, namespace, project, name, version): | ||
| assert parse_dataset_uri(uri) == (namespace, project, name, version) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "uri", | ||
| [ | ||
| "ds://result", | ||
| "ds://[email protected]", | ||
| "ds://@[email protected]", | ||
| "ds://@[email protected]", | ||
| ], | ||
| ) | ||
| def test_parse_dataset_uri_invalid_format(uri): | ||
| with pytest.raises(ValueError) as excinfo: | ||
| parse_dataset_uri(uri) | ||
|
|
||
| assert str(excinfo.value) == f"Invalid dataset URI format: {uri}" | ||
|
||
Uh oh!
There was an error while loading. Please reload this page.