Skip to content

Commit 2a461d4

Browse files
feat(dataset): external storage backend (#3323)
1 parent c8148d8 commit 2a461d4

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

106 files changed

+1295
-1149
lines changed

.github/workflows/test_deploy.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,8 @@ jobs:
710710
- name: Install system packages
711711
run: |
712712
sudo apt-get update -y
713-
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
713+
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
714+
sudo -v ; curl https://rclone.org/install.sh | sudo bash
714715
- uses: actions/cache@master
715716
id: dependency-cache
716717
with:
@@ -807,7 +808,8 @@ jobs:
807808
- name: Install system packages
808809
run: |
809810
sudo apt-get update -y
810-
sudo apt-get install -y libyaml-0-2 libyaml-dev rclone fuse
811+
sudo apt-get install -y libyaml-0-2 libyaml-dev unzip fuse
812+
sudo -v ; curl https://rclone.org/install.sh | sudo bash
811813
- uses: actions/cache@master
812814
id: dependency-cache
813815
with:

renku/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2017-2023- Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

renku/command/checks/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,10 +19,10 @@
2019
from .datasets import (
2120
check_dataset_files_outside_datadir,
2221
check_dataset_old_metadata_location,
22+
check_external_files,
2323
check_invalid_datasets_derivation,
2424
check_missing_files,
2525
)
26-
from .external import check_missing_external_files
2726
from .githooks import check_git_hooks_installed
2827
from .migration import check_migration
2928
from .project import check_project_id_group
@@ -43,7 +42,7 @@
4342
"check_lfs_info",
4443
"check_migrated_activity_ids",
4544
"check_migration",
46-
"check_missing_external_files",
45+
"check_external_files",
4746
"check_missing_files",
4847
"check_project_id_group",
4948
"check_project_structure",

renku/command/checks/activities.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

renku/command/checks/datasets.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -70,6 +69,9 @@ def check_missing_files(dataset_gateway: IDatasetGateway, **_):
7069
missing = defaultdict(list)
7170

7271
for dataset in dataset_gateway.get_all_active_datasets():
72+
# NOTE: Datasets with storage backend don't have local copies of files
73+
if dataset.storage:
74+
continue
7375
for file_ in dataset.files:
7476
path = project_context.path / file_.entity.path
7577
file_exists = path.exists() or (file_.is_external and os.path.lexists(path))
@@ -163,7 +165,7 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
163165
detected_files = []
164166

165167
for file in dataset.files:
166-
if file.is_external:
168+
if file.is_external or file.linked:
167169
continue
168170
try:
169171
get_safe_relative_path(project_context.path / file.entity.path, project_context.path / data_dir)
@@ -194,3 +196,50 @@ def check_dataset_files_outside_datadir(fix, dataset_gateway: IDatasetGateway, *
194196
return False, problems
195197

196198
return True, None
199+
200+
201+
@inject.autoparams("dataset_gateway")
202+
def check_external_files(fix, dataset_gateway: IDatasetGateway, **_):
203+
"""Find external files.
204+
205+
Args:
206+
fix: Whether to fix found issues.
207+
dataset_gateway(IDatasetGateway): The injected dataset gateway.
208+
_: keyword arguments.
209+
210+
Returns:
211+
Tuple of whether no external files are found and string of found problems.
212+
"""
213+
from renku.core.dataset.dataset import file_unlink
214+
215+
external_files = []
216+
datasets = defaultdict(list)
217+
218+
for dataset in dataset_gateway.get_all_active_datasets():
219+
for file in dataset.files:
220+
if file.is_external:
221+
external_files.append(file.entity.path)
222+
datasets[dataset.name].append(file)
223+
224+
if not external_files:
225+
return True, None
226+
227+
external_files_str = "\n\t".join(sorted(external_files))
228+
229+
if not fix:
230+
problems = (
231+
f"\n{WARNING}: External files are deprecated in favor of an external dataset backend.\n"
232+
"Use 'renku dataset rm' or rerun 'renku doctor' with '--fix' flag to remove them:\n\t"
233+
f"{external_files_str}\n"
234+
)
235+
return False, problems
236+
237+
communication.info(
238+
"The following external files were deleted from the project. You need to add them later manually using a "
239+
f"dataset with an external storage backend:\n\t{external_files_str}"
240+
)
241+
242+
for name, files in datasets.items():
243+
file_unlink(name=name, yes=True, dataset_files=files)
244+
245+
return True, None

renku/command/checks/external.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

renku/command/checks/githooks.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

renku/command/checks/migration.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

renku/command/checks/project.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

renku/command/checks/storage.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
#
2-
# Copyright 2020 - Swiss Data Science Center (SDSC)
3-
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
1+
# Copyright Swiss Data Science Center (SDSC). A partnership between
2+
# École Polytechnique Fédérale de Lausanne (EPFL) and
43
# Eidgenössische Technische Hochschule Zürich (ETHZ).
54
#
65
# Licensed under the Apache License, Version 2.0 (the "License");

0 commit comments

Comments
 (0)