
Commit bb326d2

feat(dataset): copy data to s3 (#3163)
1 parent: aaa7722 · commit: bb326d2

32 files changed: +666, -537 lines

docs/conf.py

Lines changed: 1 addition & 0 deletions
@@ -357,6 +357,7 @@
     ("py:class", "IActivityGateway"),
     ("py:class", "IDatasetGateway"),
     ("py:class", "IPlanGateway"),
+    ("py:class", "IStorage"),
     ("py:class", "IStorageFactory"),
     ("py:class", "NoValueType"),
     ("py:class", "OID_TYPE"),

docs/reference/gateways.rst

Lines changed: 1 addition & 5 deletions
@@ -68,15 +68,11 @@ Implementation of Gateway interfaces.
     :members:
     :show-inheritance:

-.. automodule:: renku.infrastructure.storage.base
-    :members:
-    :show-inheritance:
-
 .. automodule:: renku.infrastructure.storage.factory
     :members:
     :show-inheritance:

-.. automodule:: renku.infrastructure.storage.s3
+.. automodule:: renku.infrastructure.storage.rclone
     :members:
     :show-inheritance:


docs/spelling_wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ installable
 interoperability
 io
 ipynb
+IStorage
 iteratively
 ize
 izing

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -321,7 +321,7 @@ addopts = "--flake8 --black --doctest-glob=\"*.rst\" --doctest-modules --cov --c
 doctest_optionflags = "ALLOW_UNICODE"
 flake8-ignore = ["*.py", "E121", "E126", "E203", "E226", "E231", "W503", "W504", "docs/conf.py", "docs/cheatsheet/conf.py", "ALL"]
 flake8-max-line-length = 120
-testpaths = ["docs", "tests", "renku", "conftest.py"]
+testpaths = ["docs", "tests", "conftest.py"]
 markers = [
     "integration: mark a test as a integration.",
     "jobs: mark a test as a job test.",

renku/core/dataset/dataset.py

Lines changed: 9 additions & 26 deletions
@@ -37,7 +37,6 @@
 from renku.core.dataset.request_model import ImageRequestModel
 from renku.core.dataset.tag import get_dataset_by_tag, prompt_access_token, prompt_tag_selection
 from renku.core.interface.dataset_gateway import IDatasetGateway
-from renku.core.interface.storage import IStorageFactory
 from renku.core.storage import check_external_storage, pull_paths_from_storage, track_paths_in_storage
 from renku.core.util import communication
 from renku.core.util.datetime8601 import local_now
@@ -399,7 +398,7 @@ def export_dataset(name, provider_name, tag, **kwargs):

     dataset: Optional[Dataset] = datasets_provenance.get_by_name(name, strict=True, immutable=True)

-    provider = ProviderFactory.from_name(provider_name)
+    provider = ProviderFactory.get_export_provider(provider_name=provider_name)

     selected_tag = None
     tags = datasets_provenance.get_all_tags(dataset)  # type: ignore
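
The export path now asks the factory specifically for an export-capable provider instead of doing a generic name lookup. A minimal sketch of the new call site, assuming ProviderFactory is importable from renku.core.dataset.providers.factory and that "zenodo" is an export-capable provider name (both are illustrative assumptions, not taken from this diff):

# Sketch: resolve a provider that supports dataset export by name.
# "zenodo" is an illustrative provider name; any export-capable provider works.
from renku.core.dataset.providers.factory import ProviderFactory

provider = ProviderFactory.get_export_provider(provider_name="zenodo")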
@@ -879,11 +878,7 @@ def update_dataset_custom_metadata(


 @inject.autoparams("dataset_gateway")
-def move_files(
-    dataset_gateway: IDatasetGateway,
-    files: Dict[Path, Path],
-    to_dataset_name: Optional[str] = None,
-):
+def move_files(dataset_gateway: IDatasetGateway, files: Dict[Path, Path], to_dataset_name: Optional[str] = None):
     """Move files and their metadata from one or more datasets to a target dataset.

     Args:
@@ -1222,13 +1217,11 @@ def should_include(filepath: Path) -> bool:
     return sorted(records, key=lambda r: r.date_added)


-@inject.autoparams("storage_factory")
-def pull_external_data(name: str, storage_factory: IStorageFactory, location: Optional[Path] = None) -> None:
+def pull_external_data(name: str, location: Optional[Path] = None) -> None:
     """Pull/copy data for an external storage to a dataset's data directory or a specified location.

     Args:
         name(str): Name of the dataset
-        storage_factory(IStorageFactory):Injected storage factory.
         location(Optional[Path]): A directory to copy data to (Default value = None).
     """
     datasets_provenance = DatasetsProvenance()
@@ -1256,28 +1249,25 @@ def pull_external_data(name: str, storage_factory: IStorageFactory, location: Op
     create_symlinks = False

     provider = ProviderFactory.get_pull_provider(uri=dataset.storage)
+    storage = provider.get_storage()

-    credentials = S3Credentials(provider)
-    prompt_for_credentials(credentials)
-
-    storage = storage_factory.get_storage(provider=provider, credentials=credentials)
     updated_files = []

     for file in dataset.files:
         path = Path(destination) / file.entity.path
         path.parent.mkdir(parents=True, exist_ok=True)
         # NOTE: Don't check if destination exists. ``IStorage.copy`` won't copy a file if it exists and is not modified.

-        if not file.source:
+        if not file.based_on:
             raise errors.DatasetImportError(f"Dataset file doesn't have a URI: {file.entity.path}")

         with communication.busy(f"Copying {file.entity.path} ..."):
-            storage.copy(file.source, path)
+            storage.download(file.based_on.url, path)

         # NOTE: Make files read-only since we don't support pushing data to the remote storage
         os.chmod(path, 0o400)

-        if file.based_on and not file.based_on.checksum:
+        if not file.based_on.checksum:
             md5_hash = hash_file(path, hash_type="md5") or ""
             file.based_on = RemoteEntity(checksum=md5_hash, url=file.based_on.url, path=file.based_on.path)

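pull_external_data no longer receives an injected IStorageFactory: the provider builds its own IStorage via provider.get_storage(), and files are fetched through their based_on.url rather than copied from file.source. A condensed restatement of the download loop above as a standalone helper (the name copy_dataset_files is hypothetical; storage and files follow the interfaces used in the hunk):

import os
from pathlib import Path

def copy_dataset_files(storage, files, destination: Path) -> None:
    # Sketch of the loop above, not the exact source.
    for file in files:
        target = destination / file.entity.path
        target.parent.mkdir(parents=True, exist_ok=True)
        # Per the NOTE in the hunk, the storage skips files that exist and are unmodified.
        storage.download(file.based_on.url, target)
        # Pulled files are made read-only; pushing data back is unsupported.
        os.chmod(target, 0o400)
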
@@ -1313,20 +1303,13 @@ def read_dataset_data_location(dataset: Dataset) -> Optional[str]:
     return get_value(section="dataset-locations", key=dataset.name, config_filter=ConfigFilter.LOCAL_ONLY)


-@inject.autoparams("storage_factory")
-def mount_external_storage(
-    name: str,
-    existing: Optional[Path],
-    yes: bool,
-    storage_factory: IStorageFactory,
-) -> None:
+def mount_external_storage(name: str, existing: Optional[Path], yes: bool) -> None:
     """Mount an external storage to a dataset's data directory.

     Args:
         name(str): Name of the dataset
         existing(Optional[Path]): An existing mount point to use instead of actually mounting the external storage.
         yes(bool): Don't prompt when removing non-empty dataset's data directory.
-        storage_factory(IStorageFactory): Injected storage factory.
     """
     dataset, datadir = _get_dataset_with_external_storage(name=name)

@@ -1350,8 +1333,8 @@ def mount_external_storage(
     provider = ProviderFactory.get_mount_provider(uri=dataset.storage)
     credentials = S3Credentials(provider)
     prompt_for_credentials(credentials)
+    storage = provider.get_storage(credentials=credentials)

-    storage = storage_factory.get_storage(provider=provider, credentials=credentials)
     with communication.busy(f"Mounting {provider.uri}"):
         storage.mount(datadir)

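Mounting follows the same pattern: the S3 credentials prompt stays, but the IStorage now also comes from the provider rather than from an injected factory. A minimal sketch of the new sequence, assuming dataset.storage holds a supported URI such as an S3 bucket (illustrative):

# Sketch of the mount sequence shown in the hunk above.
provider = ProviderFactory.get_mount_provider(uri=dataset.storage)
credentials = S3Credentials(provider)
prompt_for_credentials(credentials)  # asks the user for any missing secrets
storage = provider.get_storage(credentials=credentials)
storage.mount(datadir)  # datadir is the dataset's data directory
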