Commit 47fa194

feat(dataset): support for azure blob storage (#3257)
1 parent bd063cc commit 47fa194

File tree

17 files changed: +675 −198 lines

.github/workflows/test_deploy.yml

Lines changed: 6 additions & 0 deletions

@@ -754,6 +754,9 @@ jobs:
         ZENODO_ACCESS_TOKEN: ${{ secrets.ZENODO_ACCESS_TOKEN }}
         OLOS_ACCESS_TOKEN: ${{ secrets.OLOS_ACCESS_TOKEN }}
         RENKU_REQUESTS_TIMEOUT_SECONDS: 120
+        CLOUD_STORAGE_AZURE_KEY: ${{ secrets.CLOUD_STORAGE_AZURE_KEY }}
+        CLOUD_STORAGE_S3_ACCESS_KEY_ID: ${{ secrets.CLOUD_STORAGE_S3_ACCESS_KEY_ID }}
+        CLOUD_STORAGE_S3_SECRET_ACCESS_KEY: ${{ secrets.CLOUD_STORAGE_S3_SECRET_ACCESS_KEY }}
       run: pytest -m "integration and not service and not serial" -v --timeout=600 -n auto
     - name: Start Redis
       uses: supercharge/[email protected]

@@ -912,6 +915,9 @@ jobs:
         ZENODO_ACCESS_TOKEN: ${{ secrets.ZENODO_ACCESS_TOKEN }}
         OLOS_ACCESS_TOKEN: ${{ secrets.OLOS_ACCESS_TOKEN }}
         RENKU_REQUESTS_TIMEOUT_SECONDS: 120
+        CLOUD_STORAGE_AZURE_KEY: ${{ secrets.CLOUD_STORAGE_AZURE_KEY }}
+        CLOUD_STORAGE_S3_ACCESS_KEY_ID: ${{ secrets.CLOUD_STORAGE_S3_ACCESS_KEY_ID }}
+        CLOUD_STORAGE_S3_SECRET_ACCESS_KEY: ${{ secrets.CLOUD_STORAGE_S3_SECRET_ACCESS_KEY }}
       run: pytest -m "integration and not serial" -v
     - name: Start Redis
       uses: supercharge/[email protected]

renku/command/dataset.py

Lines changed: 5 additions & 0 deletions

@@ -138,3 +138,8 @@ def mount_external_storage_command(unmount: bool):
     """Command for mounting an external storage."""
     command = unmount_external_storage if unmount else mount_external_storage
     return Command().command(command).lock_dataset().with_database(write=False).require_migration()
+
+
+def unmount_external_storage_command():
+    """Command for unmounting an external storage."""
+    return Command().command(unmount_external_storage).lock_dataset().with_database(write=False).require_migration()
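
For context, a command built this way is typically run through Renku's command-builder build/execute chain. A rough usage sketch, where the dataset name and the keyword argument are assumptions rather than something this diff shows:

    from renku.command.dataset import unmount_external_storage_command

    # Build and run the command; `name` is an assumed parameter mirroring
    # mount_external_storage's signature, not confirmed by this diff.
    unmount_external_storage_command().build().execute(name="my-dataset")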

renku/core/dataset/dataset.py

Lines changed: 3 additions & 6 deletions

@@ -34,7 +34,6 @@
 from renku.core.dataset.pointer_file import create_external_file, is_external_file_updated, update_external_file
 from renku.core.dataset.providers.factory import ProviderFactory
 from renku.core.dataset.providers.models import ProviderDataset
-from renku.core.dataset.providers.s3 import S3Credentials
 from renku.core.dataset.request_model import ImageRequestModel
 from renku.core.dataset.tag import get_dataset_by_tag, prompt_access_token, prompt_tag_selection
 from renku.core.interface.dataset_gateway import IDatasetGateway

@@ -882,10 +881,8 @@ def update_dataset_custom_metadata(
     if custom_metadata is not None and custom_metadata_source is not None:
         if isinstance(custom_metadata, dict):
             custom_metadata = [custom_metadata]
-        for icustom_metadata in custom_metadata:
-            existing_metadata.append(
-                Annotation(id=Annotation.generate_id(), body=icustom_metadata, source=custom_metadata_source)
-            )
+        for cm in custom_metadata:
+            existing_metadata.append(Annotation(id=Annotation.generate_id(), body=cm, source=custom_metadata_source))

     dataset.annotations = existing_metadata

@@ -1346,7 +1343,7 @@ def mount_external_storage(name: str, existing: Optional[Path], yes: bool) -> None:
     datadir.mkdir(parents=True, exist_ok=True)

     provider = ProviderFactory.get_mount_provider(uri=dataset.storage)
-    credentials = S3Credentials(provider)
+    credentials = provider.get_credentials()
     prompt_for_credentials(credentials)
     storage = provider.get_storage(credentials=credentials)
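
With the hardcoded S3Credentials gone, mount_external_storage works with any backend whose provider implements get_credentials(). A minimal sketch of the resulting provider-agnostic flow, using a made-up Azure URI:

    from renku.core.dataset.providers.factory import ProviderFactory
    from renku.core.util.metadata import prompt_for_credentials

    provider = ProviderFactory.get_mount_provider(uri="azure://myaccount/mycontainer/data")
    credentials = provider.get_credentials()  # e.g. AzureCredentials or S3Credentials
    prompt_for_credentials(credentials)
    storage = provider.get_storage(credentials=credentials)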

renku/core/dataset/dataset_add.py

Lines changed: 29 additions & 19 deletions

@@ -322,23 +322,33 @@ def move_file(file: DatasetAddMetadata, storage: Optional[IStorage]):
         else:
             file.action = DatasetAddAction.DOWNLOAD

-    if file.action == DatasetAddAction.COPY:
-        shutil.copy(file.source, file.destination)
-    elif file.action == DatasetAddAction.MOVE:
-        shutil.move(file.source, file.destination, copy_function=shutil.copy)  # type: ignore
-    elif file.action == DatasetAddAction.SYMLINK:
-        create_external_file(target=file.source, path=file.destination)
-        # NOTE: Don't track symlinks to external files in LFS
-        track_in_lfs = False
-    elif file.action == DatasetAddAction.DOWNLOAD:
-        assert file.provider, f"Storage provider isn't set for {file} with DOWNLOAD action"
-        storage = file.provider.get_storage()
-        storage.download(file.url, file.destination)
-    elif file.metadata_only:
-        # NOTE: Nothing to do when adding file to a dataset with a parent remote storage
-        pass
-    else:
-        raise errors.OperationError(f"Invalid action {file.action}")
+    file_to_upload = file.source.resolve()
+
+    try:
+        if file.action == DatasetAddAction.COPY:
+            shutil.copy(file.source, file.destination)
+        elif file.action == DatasetAddAction.MOVE:
+            shutil.move(file.source, file.destination, copy_function=shutil.copy)  # type: ignore
+        elif file.action == DatasetAddAction.SYMLINK:
+            create_external_file(target=file.source, path=file.destination)
+            # NOTE: Don't track symlinks to external files in LFS
+            track_in_lfs = False
+        elif file.action == DatasetAddAction.DOWNLOAD:
+            assert file.provider, f"Storage provider isn't set for {file} with DOWNLOAD action"
+            download_storage = file.provider.get_storage()
+            download_storage.download(file.url, file.destination)
+            file_to_upload = file.destination
+        elif file.metadata_only:
+            # NOTE: Nothing to do when adding file to a dataset with a parent remote storage
+            pass
+        else:
+            raise errors.OperationError(f"Invalid action {file.action}")
+    except OSError as e:
+        # NOTE: It's ok if copying data to a read-only mounted cloud storage fails
+        if "Read-only file system" in str(e) and storage:
+            pass
+        else:
+            raise

     if track_in_lfs and not dataset.storage:
         track_paths_in_storage(file.destination)

@@ -352,8 +362,8 @@ def move_file(file: DatasetAddMetadata, storage: Optional[IStorage]):
             md5_hash = file.based_on.checksum
         else:
             file_uri = get_upload_uri(dataset=dataset, entity_path=file.entity_path)
-            storage.upload(source=file.destination, uri=file_uri)
-            md5_hash = hash_file(file.destination, hash_type="md5") or ""
+            storage.upload(source=file_to_upload, uri=file_uri)
+            md5_hash = hash_file(file_to_upload, hash_type="md5") or ""

     file.based_on = RemoteEntity(url=file_uri, path=file.entity_path, checksum=md5_hash)
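
The new try/except narrows error handling to one expected failure: writing into a read-only mounted cloud datadir raises EROFS, which is safe to ignore when a remote storage backs the dataset, while any other OSError still propagates. A self-contained sketch of the same pattern, with hypothetical paths and a simplified flag in place of the storage object:

    import shutil


    def copy_tolerating_readonly(source: str, destination: str, has_remote_storage: bool) -> None:
        """Copy a file but ignore failures caused by a read-only mount when remote storage backs the data."""
        try:
            shutil.copy(source, destination)
        except OSError as e:
            # EROFS surfaces as "Read-only file system" in the exception message
            if "Read-only file system" in str(e) and has_remote_storage:
                pass  # the data lives in (or will be uploaded to) the remote storage anyway
            else:
                raise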

renku/core/dataset/providers/api.py

Lines changed: 14 additions & 1 deletion

@@ -132,6 +132,11 @@ def get_importer(self, **kwargs) -> "ImporterApi":
 class StorageProviderInterface(abc.ABC):
     """Interface defining backend storage providers."""

+    @abc.abstractmethod
+    def get_credentials(self) -> "ProviderCredentials":
+        """Return an instance of provider's credential class."""
+        raise NotImplementedError
+
     @abc.abstractmethod
     def get_storage(self, credentials: Optional["ProviderCredentials"] = None) -> "IStorage":
         """Return the storage manager for the provider."""

@@ -285,10 +290,18 @@ def get_canonical_credentials_names(self) -> Tuple[str, ...]:

         return tuple(get_canonical_key(key) for key in self.get_credentials_names())

+    def get_canonical_credentials_names_with_no_value(self) -> Tuple[str, ...]:
+        """Return canonical credentials names that can be used as config keys for keys with no valid value."""
+        from renku.core.util.metadata import get_canonical_key
+
+        return tuple(get_canonical_key(key) for key in self.get_credentials_names_with_no_value())
+
     def get_credentials_section_name(self) -> str:
         """Get section name for storing credentials.

         NOTE: This method should be overridden by subclasses to allow multiple credentials per provider if needed.
+        NOTE: Values used in this method shouldn't depend on ProviderCredentials attributes since we don't have those
+        attributes when reading credentials. It's OK to use ProviderApi attributes.
         """
         return self.provider.name.lower()  # type: ignore

@@ -302,7 +315,7 @@ def read_and_convert_credentials(key) -> Union[str, NoValueType]:
             value = read_credentials(section=section, key=key)
             return NO_VALUE if value is None else value

-        data = {key: read_and_convert_credentials(key) for key in self.get_canonical_credentials_names()}
+        data = {key: read_and_convert_credentials(key) for key in self.get_canonical_credentials_names_with_no_value()}
         self.data.update(data)

         return self.data
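
The change to read() means only credential keys that still hold no valid value are looked up in the on-disk credentials store, so values seeded beforehand, for example by a subclass constructor as AzureCredentials does below with the account name, are not overwritten. A toy illustration under that assumption (URI and account are made up):

    from renku.core.dataset.providers.azure import AzureCredentials, AzureProvider

    azure_provider = AzureProvider(uri="azure://myaccount/mycontainer")
    credentials = AzureCredentials(provider=azure_provider)  # seeds "account" with a real value
    credentials.read()  # consults the stored credentials only for the still-missing "key"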

renku/core/dataset/providers/azure.py

Lines changed: 181 additions & 0 deletions (new file)

# -*- coding: utf-8 -*-
#
# Copyright 2017-2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Azure dataset provider."""

import urllib.parse
from typing import TYPE_CHECKING, List, Optional, Tuple, cast

from renku.command.command_builder import inject
from renku.core import errors
from renku.core.dataset.providers.api import ProviderApi, ProviderCredentials, ProviderPriority
from renku.core.dataset.providers.cloud import CloudStorageAddProvider
from renku.core.dataset.providers.models import ProviderParameter
from renku.core.interface.storage import IStorage, IStorageFactory
from renku.core.util.metadata import get_canonical_key, prompt_for_credentials
from renku.core.util.urls import get_scheme
from renku.domain_model.project_context import project_context

if TYPE_CHECKING:
    from renku.domain_model.dataset import Dataset


class AzureProvider(ProviderApi, CloudStorageAddProvider):
    """Azure provider."""

    priority = ProviderPriority.HIGHEST
    name = "Azure"

    def __init__(self, uri: Optional[str]):
        super().__init__(uri=uri)

        account, endpoint, container, _ = parse_azure_uri(uri=self.uri)

        self._account: str = account
        self._endpoint: str = endpoint
        self._container = container

    @staticmethod
    def supports(uri: str) -> bool:
        """Whether or not this provider supports a given URI."""
        return get_scheme(uri) == "azure"

    @staticmethod
    def get_add_parameters() -> List["ProviderParameter"]:
        """Return parameters that can be set for add."""
        from renku.core.dataset.providers.models import ProviderParameter

        return [
            ProviderParameter(
                "storage",
                flags=["storage"],
                default=None,
                help="URI for the Azure container when creating the dataset at the same time as running 'add'",
                multiple=False,
                type=str,
            ),
        ]

    def get_credentials(self) -> "AzureCredentials":
        """Return an instance of the provider's credential class."""
        return AzureCredentials(provider=self)

    @inject.autoparams("storage_factory")
    def get_storage(
        self, storage_factory: "IStorageFactory", credentials: Optional["ProviderCredentials"] = None
    ) -> "IStorage":
        """Return the storage manager for the provider."""
        azure_configuration = {
            "type": "azureblob",
            "endpoint": self.endpoint,
        }

        def create_renku_storage_azure_uri(uri: str) -> str:
            """Create an Azure URI to work with the Renku storage handler."""
            _, _, container, path = parse_azure_uri(uri=uri)

            return f"azure://{container}/{path}"

        if not credentials:
            credentials = self.get_credentials()
            prompt_for_credentials(credentials)

        return storage_factory.get_storage(
            storage_scheme="azure",
            provider=self,
            credentials=credentials,
            configuration=azure_configuration,
            uri_convertor=create_renku_storage_azure_uri,
        )

    @property
    def account(self) -> str:
        """Return the Azure account name."""
        return self._account

    @property
    def endpoint(self) -> str:
        """Return the Azure container endpoint."""
        return self._endpoint

    @property
    def container(self) -> str:
        """Return the Azure container name."""
        return self._container

    def on_create(self, dataset: "Dataset") -> None:
        """Hook to perform provider-specific actions on a newly-created dataset."""
        credentials = self.get_credentials()
        prompt_for_credentials(credentials)
        storage = self.get_storage(credentials=credentials)

        # NOTE: The underlying rclone tool cannot tell if a directory within an Azure container exists or not
        if not storage.exists(self.uri):
            raise errors.ParameterError(f"Azure container '{self.container}' doesn't exist.")

        project_context.repository.add_ignored_pattern(pattern=str(dataset.get_datadir()))


class AzureCredentials(ProviderCredentials):
    """Azure-specific credentials."""

    def __init__(self, provider: AzureProvider):
        super().__init__(provider=provider)

        # NOTE: Set the account name so that users don't need to re-enter it
        self.data[get_canonical_key("Account")] = self.provider.account

    @staticmethod
    def get_credentials_names() -> Tuple[str, ...]:
        """Return a tuple of the required credentials for a provider."""
        return "Account", "Key"

    @property
    def provider(self) -> AzureProvider:
        """Return the associated provider instance."""
        return cast(AzureProvider, self._provider)

    def get_credentials_section_name(self) -> str:
        """Get the section name for storing credentials.

        NOTE: This method should be overridden by subclasses to allow multiple credentials per provider if needed.
        """
        return f"{self.provider.account}.{self.provider.endpoint}"


def parse_azure_uri(uri: str) -> Tuple[str, str, str, str]:
    """Extract account, endpoint, container, and path within the container from a given URI.

    NOTE: We support 'azure://<account-name>.<endpoint>/<container-name>/<path>' or
    'azure://<account-name>/<container-name>/<path>'.
    """
    parsed_uri = urllib.parse.urlparse(uri)

    account, _, endpoint = parsed_uri.netloc.partition(".")

    if parsed_uri.scheme.lower() != "azure" or not account:
        raise errors.ParameterError(
            f"Invalid Azure URI: {uri}. Valid format is 'azure://<account-name>.<endpoint>/<container-name>/<path>' "
            "or 'azure://<account-name>/<container-name>/<path>'"
        )

    endpoint = endpoint.lower() or "blob.core.windows.net"

    path = parsed_uri.path.strip("/")
    container, _, path = path.partition("/")

    return account, endpoint, container, path.strip("/")
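
A quick sketch of what parse_azure_uri returns for the two supported URI shapes (account and container names are made up); note that the endpoint defaults to blob.core.windows.net when omitted:

    from renku.core.dataset.providers.azure import parse_azure_uri

    parse_azure_uri("azure://myaccount/mycontainer/path/to/file")
    # -> ("myaccount", "blob.core.windows.net", "mycontainer", "path/to/file")

    parse_azure_uri("azure://myaccount.blob.core.windows.net/mycontainer/path/to/file")
    # -> ("myaccount", "blob.core.windows.net", "mycontainer", "path/to/file")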

renku/core/dataset/providers/cloud.py

Lines changed: 54 additions & 0 deletions (new file)

# -*- coding: utf-8 -*-
#
# Copyright 2017-2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common functionality for cloud storage providers."""

import re
from pathlib import Path
from typing import List

from renku.core import errors
from renku.core.dataset.providers.api import AddProviderInterface, StorageProviderInterface
from renku.core.dataset.providers.models import DatasetAddAction, DatasetAddMetadata
from renku.domain_model.dataset import RemoteEntity
from renku.domain_model.project_context import project_context


class CloudStorageAddProvider(AddProviderInterface, StorageProviderInterface):
    """Common AddProviderInterface for cloud providers."""

    def add(self, uri: str, destination: Path, **kwargs) -> List["DatasetAddMetadata"]:
        """Add files from a URI to a dataset."""
        if re.search(r"[*?]", uri):
            raise errors.ParameterError("Wildcards like '*' or '?' are not supported for cloud storage URIs.")

        storage = self.get_storage()

        destination_path_in_repo = Path(destination).relative_to(project_context.repository.path)
        hashes = storage.get_hashes(uri=uri)
        return [
            DatasetAddMetadata(
                entity_path=destination_path_in_repo / hash.path,
                url=hash.base_uri,
                action=DatasetAddAction.REMOTE_STORAGE,
                based_on=RemoteEntity(checksum=hash.hash if hash.hash else "", url=hash.base_uri, path=hash.path),
                source=Path(hash.base_uri),
                destination=destination_path_in_repo,
                provider=self,
            )
            for hash in hashes
        ]
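
Reading the comprehension above: get_hashes is expected to yield one entry per object under the URI, and each entry's relative path is appended to the dataset's directory inside the repository. A runnable toy recreation of that path computation, with all values made up:

    from pathlib import Path

    repo_path = Path("/home/user/my-project")
    destination = Path("/home/user/my-project/data/my-dataset")
    destination_path_in_repo = destination.relative_to(repo_path)  # data/my-dataset

    # An object stored at "dir/file2.csv" under the container ends up at:
    entity_path = destination_path_in_repo / "dir/file2.csv"
    print(entity_path)  # data/my-dataset/dir/file2.csv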
