Skip to content

Commit 19142c6

Browse files
fix(service): normalize git url to avoid duplicate cache entries (#3606)
1 parent e0ff587 commit 19142c6

File tree

14 files changed, with 127 additions and 44 deletions.

renku/core/util/contexts.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
from renku.core import errors
2727
from renku.core.interface.database_gateway import IDatabaseGateway
2828
from renku.core.interface.project_gateway import IProjectGateway
29+
from renku.ui.service.utils import normalize_git_url
2930

3031

3132
@contextlib.contextmanager
@@ -114,6 +115,8 @@ def renku_project_context(path, check_git_path=True):
114115
if check_git_path:
115116
path = get_git_path(path)
116117

118+
path = normalize_git_url(str(path))
119+
117120
with project_context.with_path(path=path):
118121
project_context.external_storage_requested = True
119122
yield project_context.path

renku/domain_model/git.py

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525

2626
from renku.core import errors
2727
from renku.core.util.os import is_ascii, normalize_to_ascii
28+
from renku.ui.service.utils import normalize_git_url
2829

2930
_RE_SCHEME = r"(?P<scheme>(git\+)?(https?|git|ssh|rsync))\://"
3031

@@ -70,13 +71,6 @@ def _build(*parts):
7071
]
7172

7273

73-
def filter_repo_name(repo_name: str) -> str:
74-
"""Remove the .git extension from the repo name."""
75-
if repo_name is not None and repo_name.endswith(".git"):
76-
return repo_name[: -len(".git")]
77-
return repo_name
78-
79-
8074
@attr.s()
8175
class GitURL:
8276
"""Parser for common Git URLs."""
@@ -90,14 +84,14 @@ class GitURL:
9084
password = attr.ib(default=None)
9185
port = attr.ib(default=None)
9286
owner = attr.ib(default=None)
93-
name: Optional[str] = attr.ib(default=None, converter=filter_repo_name)
87+
name: Optional[str] = attr.ib(default=None, converter=normalize_git_url)
9488
slug = attr.ib(default=None)
9589
_regex = attr.ib(default=None, eq=False, order=False)
9690

9791
def __attrs_post_init__(self):
9892
"""Derive basic information."""
9993
if not self.name and self.path:
100-
self.name = filter_repo_name(Path(self.path).name)
94+
self.name = normalize_git_url(Path(self.path).name)
10195

10296
self.slug = normalize_to_ascii(self.name)
10397

renku/ui/service/cache/models/project.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,6 +14,7 @@
1514
# See the License for the specific language governing permissions and
1615
# limitations under the License.
1716
"""Renku service cache project related models."""
17+
1818
import os
1919
import shutil
2020
from datetime import datetime
@@ -26,6 +26,7 @@
2626

2727
from renku.ui.service.cache.base import BaseCache
2828
from renku.ui.service.config import CACHE_PROJECTS_PATH
29+
from renku.ui.service.utils import normalize_git_url
2930

3031
MAX_CONCURRENT_PROJECT_REQUESTS = 10
3132
LOCK_TIMEOUT = 15
@@ -61,7 +62,7 @@ def abs_path(self) -> Path:
6162
branch = self.branch
6263
if not self.branch:
6364
branch = NO_BRANCH_FOLDER
64-
return CACHE_PROJECTS_PATH / self.user_id / self.owner / self.slug / branch
65+
return CACHE_PROJECTS_PATH / self.user_id / self.owner / normalize_git_url(self.slug) / branch
6566

6667
def read_lock(self, timeout: Optional[float] = None):
6768
"""Shared read lock on the project."""

renku/ui/service/cache/projects.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ class ProjectManagementCache(BaseCache):
3131

3232
project_schema = ProjectSchema()
3333

34-
def make_project(self, user, project_data, persist=True):
34+
def make_project(self, user, project_data, persist=True) -> Project:
3535
"""Store user project metadata."""
3636
project_data.update({"user_id": user.user_id})
3737

renku/ui/service/cache/serializers/project.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,6 +21,7 @@
2221

2322
from renku.ui.service.cache.models.project import Project
2423
from renku.ui.service.serializers.common import AccessSchema, CreationSchema, MandatoryUserSchema
24+
from renku.ui.service.utils import normalize_git_url
2525

2626

2727
class ProjectSchema(CreationSchema, AccessSchema, MandatoryUserSchema):
@@ -43,4 +43,7 @@ class ProjectSchema(CreationSchema, AccessSchema, MandatoryUserSchema):
4343
@post_load
4444
def make_project(self, data, **options):
4545
"""Construct project object."""
46+
data["git_url"] = normalize_git_url(data["git_url"])
47+
data["name"] = normalize_git_url(data["name"])
48+
data["slug"] = normalize_git_url(data["slug"])
4649
return Project(**data)

renku/ui/service/controllers/api/mixins.py

Lines changed: 26 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from abc import ABCMeta, abstractmethod
1919
from functools import wraps
2020
from pathlib import Path
21+
from typing import Optional, Union
2122

2223
import portalocker
2324

@@ -41,6 +42,7 @@
4142
from renku.ui.service.jobs.contexts import enqueue_retry
4243
from renku.ui.service.jobs.delayed_ctrl import delayed_ctrl_job
4344
from renku.ui.service.serializers.common import DelayedResponseRPC
45+
from renku.ui.service.utils import normalize_git_url
4446

4547
PROJECT_FETCH_TIME = 30
4648

@@ -96,15 +98,30 @@ def __init__(
9698
self.migrate_project = self.request_data.get("migrate_project", False)
9799

98100
# NOTE: This is absolute project path and it's set before invocation of `renku_op`,
99-
# so its safe to use it in controller operations. Its type will always be `pathlib.Path`.
100-
self.project_path = None
101+
# so it's safe to use it in controller operations. Its type will always be `pathlib.Path`.
102+
self._project_path = None
101103

102104
@property
103105
@abstractmethod
104106
def context(self):
105107
"""Operation context."""
106108
raise NotImplementedError
107109

110+
@property
111+
def project_path(self) -> Optional[Path]:
112+
"""Absolute project's path."""
113+
return self._project_path
114+
115+
@project_path.setter
116+
def project_path(self, path: Optional[Union[str, Path]]):
117+
"""Set absolute project's path."""
118+
if not path:
119+
self._project_path = None
120+
return
121+
122+
path = normalize_git_url(str(path))
123+
self._project_path = Path(path)
124+
108125
@abstractmethod
109126
def renku_op(self):
110127
"""Implements operation for the controller."""
@@ -138,7 +155,7 @@ def execute_op(self):
138155

139156
if self.context.get("is_delayed", False) and "user_id" in self.user_data:
140157
# NOTE: After pushing the controller to delayed execution,
141-
# its important to remove the delayed mark,
158+
# it's important to remove the delayed mark,
142159
# otherwise job will keep recursively enqueuing itself.
143160
self.context.pop("is_delayed")
144161

@@ -149,13 +166,12 @@ def execute_op(self):
149166

150167
return job
151168

152-
if "git_url" in self.context and "user_id" not in self.user_data:
153-
# NOTE: Anonymous session support.
154-
return self.remote()
155-
156-
elif "git_url" in self.context and "user_id" in self.user_data:
157-
return self.local()
158-
169+
if "git_url" in self.context:
170+
if "user_id" not in self.user_data:
171+
# NOTE: Anonymous session support.
172+
return self.remote()
173+
else:
174+
return self.local()
159175
else:
160176
raise RenkuException("context does not contain `project_id` or `git_url`")
161177

renku/ui/service/controllers/project_lock_status.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2021 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,6 +23,7 @@
2423
from renku.ui.service.controllers.api.mixins import RenkuOperationMixin
2524
from renku.ui.service.errors import IntermittentProjectIdError
2625
from renku.ui.service.serializers.project import ProjectLockStatusRequest, ProjectLockStatusResponseRPC
26+
from renku.ui.service.utils import normalize_git_url
2727
from renku.ui.service.views import result_response
2828

2929

@@ -39,6 +39,9 @@ def __init__(self, cache, user_data, request_data):
3939

4040
super().__init__(cache, user_data, request_data)
4141

42+
if "git_url" in self.ctx:
43+
self.ctx["git_url"] = normalize_git_url(self.ctx["git_url"])
44+
4245
@property
4346
def context(self):
4447
"""Controller operation context."""

renku/ui/service/controllers/utils/remote_project.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,6 +25,7 @@
2625
from renku.core.util.contexts import renku_project_context
2726
from renku.infrastructure.repository import Repository
2827
from renku.ui.service.serializers.cache import ProjectCloneContext
28+
from renku.ui.service.utils import normalize_git_url
2929

3030
ANONYMOUS_SESSION = "anonymous"
3131

@@ -44,7 +44,7 @@ def __init__(self, user_data, request_data):
4444

4545
self.ctx = ProjectCloneContext().load({**user_data, **request_data}, unknown=EXCLUDE)
4646

47-
self.git_url = self.ctx["url_with_auth"]
47+
self.git_url = normalize_git_url(self.ctx["url_with_auth"])
4848
self.branch = self.ctx["branch"]
4949

5050
@property

renku/ui/service/gateways/gitlab_api_provider.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ class GitlabAPIProvider(IGitAPIProvider):
3737
target_folder: Folder to use to download the files.
3838
remote: Remote repository URL.
3939
token: User bearer token.
40-
ref: optional reference to checkout,
40+
ref: optional reference to check out,
4141
Raises:
4242
errors.ProjectNotFound: If the remote URL is not accessible.
4343
errors.AuthenticationError: If the bearer token is invalid in any way.

renku/ui/service/gateways/repository_cache.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
from renku.ui.service.errors import IntermittentCacheError, IntermittentLockError
3939
from renku.ui.service.interfaces.repository_cache import IRepositoryCache
4040
from renku.ui.service.logger import service_log
41+
from renku.ui.service.utils import normalize_git_url
4142

4243

4344
class LocalRepositoryCache(IRepositoryCache):
@@ -50,6 +51,7 @@ def get(
5051
if git_url is None:
5152
raise ValidationError("Invalid `git_url`, URL is empty", "git_url")
5253

54+
git_url = normalize_git_url(git_url)
5355
try:
5456
project = Project.get(
5557
(Project.user_id == user.user_id) & (Project.git_url == git_url) & (Project.branch == branch)
@@ -101,6 +103,8 @@ def _clone_project(
101103
self, cache: ServiceCache, git_url: str, branch: Optional[str], user: User, shallow: bool = True
102104
) -> Project:
103105
"""Clone a project to cache."""
106+
git_url = normalize_git_url(git_url)
107+
104108
try:
105109
parsed_git_url = GitURL.parse(git_url)
106110
except UnicodeError as e:
@@ -228,7 +232,7 @@ def _maybe_update_cache(self, project: Project, user: User):
228232

229233
def git_url_with_auth(project: Project, user: User):
230234
"""Format url with auth."""
231-
git_url = urlparse(project.git_url)
235+
git_url = urlparse(normalize_git_url(project.git_url))
232236

233237
url = "oauth2:{}@{}".format(user.token, git_url.netloc)
234238
return git_url._replace(netloc=url).geturl()

0 commit comments

Comments
 (0)