
Commit 3f1e707

feat(dataset): parallel data download/upload (#3358)
1 parent 2a461d4 commit 3f1e707


89 files changed: +720 / -715 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ repos:
       - id: mixed-line-ending
       - id: trailing-whitespace
   - repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 23.1.0
     hooks:
       - id: black
         additional_dependencies: ["click==8.0.4"]

conftest.py

Lines changed: 9 additions & 0 deletions
@@ -79,3 +79,12 @@ def pytest_configure(config):

     os.environ["RENKU_SKIP_MIN_VERSION_CHECK"] = "1"
     os.environ["RENKU_DISABLE_VERSION_CHECK"] = "1"
+    # NOTE: Set an env var during tests to mark that Renku is running in a test session.
+    os.environ["RENKU_RUNNING_UNDER_TEST"] = "1"
+
+
+def pytest_unconfigure(config):
+    """Hook that is called by pytest after all tests are executed."""
+    os.environ.pop("RENKU_SKIP_MIN_VERSION_CHECK", None)
+    os.environ.pop("RENKU_DISABLE_VERSION_CHECK", None)
+    os.environ.pop("RENKU_RUNNING_UNDER_TEST", None)

docs/reference/core.rst

Lines changed: 9 additions & 5 deletions
@@ -256,11 +256,11 @@ Utilities
    :members:
    :show-inheritance:

-.. automodule:: renku.core.util.file_size
+.. automodule:: renku.core.util.git
    :members:
    :show-inheritance:

-.. automodule:: renku.core.util.git
+.. automodule:: renku.core.util.jwt
    :members:
    :show-inheritance:

@@ -280,15 +280,19 @@ Utilities
    :members:
    :show-inheritance:

-.. automodule:: renku.core.util.urls
+.. automodule:: renku.core.util.ssh
    :members:
    :show-inheritance:

-.. automodule:: renku.core.util.util
+.. automodule:: renku.core.util.tabulate
    :members:
    :show-inheritance:

-.. automodule:: renku.core.util.uuid
+.. automodule:: renku.core.util.urls
+   :members:
+   :show-inheritance:
+
+.. automodule:: renku.core.util.util
    :members:
    :show-inheritance:
poetry.lock

Lines changed: 156 additions & 162 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ inject = "<4.4.0,>=4.3.0"
 jinja2 = { version = ">=2.11.3,<3.1.3" }
 networkx = "<2.7,>=2.6.0"
 numpy = ">=1.20.0,<1.22.0"
-packaging = "<22.0,>=21.3"
+packaging = "<24.0,>=23.0"
 pathspec = "<1.0.0,>=0.8.0"
 patool = "==1.12"
 pluggy = "==1.0.0"
@@ -121,7 +121,7 @@ sentry-sdk = { version = ">=1.5.11,<1.5.12", extras = ["flask"], optional = true }
 walrus = { version = ">=0.8.2,<0.10.0", optional = true }

 [tool.poetry.group.dev.dependencies]
-black = "==22.10.0"
+black = "==23.1.0"
 flake8 = ">=6.0.0,<7.0.0"
 Flake8-pyproject = "==1.2.2"
 isort = "<5.10.2,>=5.3.2"

renku/command/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 """Repository datasets management."""

 from renku.command.command_builder.command import Command
-from renku.core.constant import CONFIG_LOCAL_PATH, DATASET_METADATA_PATHS
+from renku.core.constant import DATASET_METADATA_PATHS
 from renku.core.dataset.dataset import (
     create_dataset,
     edit_dataset,
@@ -130,7 +130,7 @@ def list_tags_command():
 def pull_cloud_storage_command():
     """Command for pulling/copying data from a cloud storage."""
     command = Command().command(pull_cloud_storage).lock_dataset().with_database(write=True)
-    return command.require_migration().with_commit(commit_only=DATASET_METADATA_PATHS + [CONFIG_LOCAL_PATH])
+    return command.require_migration().with_commit(commit_only=DATASET_METADATA_PATHS)


 def mount_cloud_storage_command(unmount: bool):
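For context, commands assembled with the command builder are usually built and then executed with the wrapped function's arguments. A hedged sketch of a typical call site (the dataset name and the exact invocation are assumptions, not taken from this diff):

    from renku.command.dataset import pull_cloud_storage_command

    # Assumed usage pattern: build the command, then pass pull_cloud_storage's arguments to execute().
    result = pull_cloud_storage_command().build().execute(name="my-dataset", location=None)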

renku/core/__init__.py

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
-#
-# Copyright 2017-2023- Swiss Data Science Center (SDSC)
-# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Copyright Swiss Data Science Center (SDSC). A partnership between
+# École Polytechnique Fédérale de Lausanne (EPFL) and
 # Eidgenössische Technische Hochschule Zürich (ETHZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

renku/core/config.py

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
-#
-# Copyright 2020 - Swiss Data Science Center (SDSC)
-# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Copyright Swiss Data Science Center (SDSC). A partnership between
+# École Polytechnique Fédérale de Lausanne (EPFL) and
 # Eidgenössische Technische Hochschule Zürich (ETHZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

renku/core/constant.py

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
-#
-# Copyright 2017-2023 - Swiss Data Science Center (SDSC)
-# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Copyright Swiss Data Science Center (SDSC). A partnership between
+# École Polytechnique Fédérale de Lausanne (EPFL) and
 # Eidgenössische Technische Hochschule Zürich (ETHZ).
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

renku/core/dataset/dataset.py

Lines changed: 72 additions & 53 deletions
@@ -50,6 +50,7 @@
     delete_dataset_file,
     delete_path,
     get_absolute_path,
+    get_file_size,
     get_files,
     get_safe_relative_path,
     hash_file,
@@ -59,15 +60,18 @@
 )
 from renku.core.util.tabulate import tabulate
 from renku.core.util.urls import get_slug
-from renku.core.util.util import NO_VALUE, NoValueType
+from renku.core.util.util import parallel_execute
+from renku.domain_model.constant import NO_VALUE, NON_EXISTING_ENTITY_CHECKSUM, NoValueType
 from renku.domain_model.dataset import Dataset, DatasetDetailsJson, DatasetFile, RemoteEntity, is_dataset_name_valid
+from renku.domain_model.entity import Entity
 from renku.domain_model.enums import ConfigFilter
 from renku.domain_model.project_context import project_context
 from renku.domain_model.provenance.agent import Person
 from renku.domain_model.provenance.annotation import Annotation
 from renku.infrastructure.immutable import DynamicProxy

 if TYPE_CHECKING:
+    from renku.core.interface.storage import IStorage
     from renku.infrastructure.repository import Repository

@@ -1249,75 +1253,90 @@ def should_include(filepath: Path) -> bool:
     return sorted(records, key=lambda r: r.date_added)


-@validate_arguments(config=dict(arbitrary_types_allowed=True))
-def pull_cloud_storage(name: str, location: Optional[Path] = None) -> None:
-    """Pull/copy data for a cloud storage to a dataset's data directory or a specified location.
+def download_file(file: DatasetFile, storage: "IStorage") -> List[DatasetFile]:
+    """Download a dataset file and retrieve its missing metadata (if any).

     Args:
-        name(str): Name of the dataset
-        location(Optional[Path]): A directory to copy data to (Default value = None).
-    """
-    datasets_provenance = DatasetsProvenance()
+        file(DatasetFile): Dataset file to download.
+        storage: Dataset's cloud storage (an instance of ``IStorage``).

-    dataset = datasets_provenance.get_by_name(name=name, strict=True)
-
-    if not dataset.storage:
-        communication.warn(f"Dataset '{name}' doesn't have a storage backend")
-        return
+    Returns:
+        List[DatasetFile]: A list with the updated file if its metadata was missing; an empty list otherwise.

-    # NOTE: Try to unmount the path in case it was mounted before
-    unmount_path(project_context.path / dataset.get_datadir())
+    """
+    if not file.based_on:
+        raise errors.DatasetImportError(f"Dataset file doesn't have a URI: {file.entity.path}")

-    create_symlinks = True
-    destination: Union[Path, str]
+    path = project_context.path / file.entity.path
+    path.parent.mkdir(parents=True, exist_ok=True)

-    if location:
-        destination = get_absolute_path(location)
-    else:
-        stored_location = read_dataset_data_location(dataset=dataset)
-        if stored_location:
-            destination = stored_location
-        else:
-            destination = project_context.path
-            create_symlinks = False
+    # NOTE: Don't check if destination file exists. ``IStorage.copy`` won't copy a file if it exists and is not
+    # modified.

-    provider = ProviderFactory.get_pull_provider(uri=dataset.storage)
-    storage = provider.get_storage()
+    communication.start_progress(name=file.entity.path, total=1)
+    try:
+        storage.download(file.based_on.url, path)
+        communication.update_progress(name=file.entity.path, amount=1)
+    finally:
+        communication.finalize_progress(name=file.entity.path)

-    updated_files = []
+    # NOTE: File has no missing information
+    if file.has_valid_checksum() and file.has_valid_size():
+        return []

-    for file in dataset.files:
-        path = Path(destination) / file.entity.path
-        path.parent.mkdir(parents=True, exist_ok=True)
-        # NOTE: Don't check if destination exists. ``IStorage.copy`` won't copy a file if it exists and is not modified.
+    if not file.has_valid_checksum():
+        md5_hash = hash_file(path, hash_type="md5") or NON_EXISTING_ENTITY_CHECKSUM
+        entity = Entity(path=file.entity.path, checksum=md5_hash)
+        remote_entity = RemoteEntity(checksum=md5_hash, url=file.based_on.url, path=file.based_on.path)
+    else:
+        entity = file.entity
+        remote_entity = file.based_on
+
+    size = file.size if file.has_valid_size() else get_file_size(path)
+
+    return [
+        DatasetFile(
+            entity=entity,
+            based_on=remote_entity,
+            size=size,
+            date_added=file.date_added,
+            date_removed=file.date_removed,
+            source=file.source,
+        )
+    ]

-        if not file.based_on:
-            raise errors.DatasetImportError(f"Dataset file doesn't have a URI: {file.entity.path}")

-        with communication.busy(f"Copying {file.entity.path} ..."):
-            storage.download(file.based_on.url, path)
+@validate_arguments(config=dict(arbitrary_types_allowed=True))
+def pull_cloud_storage(name: str, location: Optional[Path] = None) -> None:
+    """Pull/copy data for a cloud storage to a dataset's data directory or a specified location.

-        # NOTE: Make files read-only since we don't support pushing data to the remote storage
-        os.chmod(path, 0o400)
+    Args:
+        name(str): Name of the dataset
+        location(Optional[Path]): A directory to copy data to (Default value = None).
+    """
+    dataset, datadir = _get_dataset_with_cloud_storage(name=name)

-        if not file.based_on.checksum:
-            md5_hash = hash_file(path, hash_type="md5") or ""
-            file.based_on = RemoteEntity(checksum=md5_hash, url=file.based_on.url, path=file.based_on.path)
+    # NOTE: Try to unmount the path in case it was mounted before
+    unmount_path(datadir)

-        new_file = DynamicProxy(file)
-        new_file.dataset = dataset
-        updated_files.append(new_file)
+    if location:
+        if not is_path_empty(datadir):
+            communication.confirm(
+                f"Dataset's data directory will be removed: {dataset.get_datadir()}. Do you want to continue?",
+                abort=True,
+                warning=True,
+            )
+        create_symlink(target=location, symlink_path=datadir, overwrite=True)

-        if create_symlinks:
-            symlink_path = project_context.path / file.entity.path
-            symlink_path.parent.mkdir(parents=True, exist_ok=True)
-            create_symlink(path=path, symlink_path=symlink_path, overwrite=True)
+    provider = ProviderFactory.get_pull_provider(uri=dataset.storage)
+    storage = provider.get_storage()

-    # NOTE: Store location in metadata in case where we want to mount the external storage in the same location
-    store_dataset_data_location(dataset=dataset, location=location)
+    updated_files = parallel_execute(download_file, dataset.files, rate=5, storage=storage)

     if updated_files:
-        _update_datasets_files_metadata(updated_files=updated_files, deleted_files=[], delete=False)
+        dataset.add_or_update_files(updated_files)
+        DatasetsProvenance().add_or_update(dataset, creator=get_git_user(repository=project_context.repository))
+        project_context.database.commit()


 def store_dataset_data_location(dataset: Dataset, location: Optional[Path]) -> None:
@@ -1358,7 +1377,7 @@ def mount_cloud_storage(name: str, existing: Optional[Path], yes: bool) -> None:
         )

     if existing:
-        create_symlink(path=existing, symlink_path=datadir, overwrite=True)
+        create_symlink(target=existing, symlink_path=datadir, overwrite=True)
         return

     delete_path(datadir)
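The implementation of parallel_execute added to renku.core.util.util is not shown in this excerpt. Based only on the call parallel_execute(download_file, dataset.files, rate=5, storage=storage), a minimal thread-pool sketch of such a helper could look as follows (the name parallel_execute_sketch, the thread-pool choice, and the reading of rate as a worker cap are assumptions):

    from concurrent.futures import ThreadPoolExecutor
    from typing import Callable, Iterable, List, TypeVar

    T = TypeVar("T")
    R = TypeVar("R")

    def parallel_execute_sketch(fn: Callable[..., List[R]], items: Iterable[T], rate: int = 5, **kwargs) -> List[R]:
        # Run fn(item, **kwargs) for every item on a bounded thread pool and flatten the per-item result lists.
        results: List[R] = []
        with ThreadPoolExecutor(max_workers=rate) as executor:
            futures = [executor.submit(fn, item, **kwargs) for item in items]
            for future in futures:
                results.extend(future.result())  # re-raises any exception from a worker
        return results

Under that shape, updated_files in pull_cloud_storage ends up as a flat list containing only the DatasetFile objects whose checksum or size had to be filled in after download.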
