50 | 50 |     delete_dataset_file,
51 | 51 |     delete_path,
52 | 52 |     get_absolute_path,
   | 53 | +    get_file_size,
53 | 54 |     get_files,
54 | 55 |     get_safe_relative_path,
55 | 56 |     hash_file,
59 | 60 | )
60 | 61 | from renku.core.util.tabulate import tabulate
61 | 62 | from renku.core.util.urls import get_slug
62 |    | -from renku.core.util.util import NO_VALUE, NoValueType
   | 63 | +from renku.core.util.util import parallel_execute
   | 64 | +from renku.domain_model.constant import NO_VALUE, NON_EXISTING_ENTITY_CHECKSUM, NoValueType
63 | 65 | from renku.domain_model.dataset import Dataset, DatasetDetailsJson, DatasetFile, RemoteEntity, is_dataset_name_valid
   | 66 | +from renku.domain_model.entity import Entity
64 | 67 | from renku.domain_model.enums import ConfigFilter
65 | 68 | from renku.domain_model.project_context import project_context
66 | 69 | from renku.domain_model.provenance.agent import Person
67 | 70 | from renku.domain_model.provenance.annotation import Annotation
68 | 71 | from renku.infrastructure.immutable import DynamicProxy
69 | 72 |
70 | 73 | if TYPE_CHECKING:
   | 74 | +    from renku.core.interface.storage import IStorage
71 | 75 |     from renku.infrastructure.repository import Repository
72 | 76 |
73 | 77 |
@@ -1249,75 +1253,90 @@ def should_include(filepath: Path) -> bool:
1249 | 1253 |     return sorted(records, key=lambda r: r.date_added)
1250 | 1254 |
1251 | 1255 |
1252 |      | -@validate_arguments(config=dict(arbitrary_types_allowed=True))
1253 |      | -def pull_cloud_storage(name: str, location: Optional[Path] = None) -> None:
1254 |      | -    """Pull/copy data for a cloud storage to a dataset's data directory or a specified location.
     | 1256 | +def download_file(file: DatasetFile, storage: "IStorage") -> List[DatasetFile]:
     | 1257 | +    """Download a dataset file and retrieve its missing metadata (if any).
1255 | 1258 |
1256 | 1259 |     Args:
1257 |      | -        name(str): Name of the dataset
1258 |      | -        location(Optional[Path]): A directory to copy data to (Default value = None).
1259 |      | -    """
1260 |      | -    datasets_provenance = DatasetsProvenance()
     | 1260 | +        file(DatasetFile): Dataset file to download.
     | 1261 | +        storage: Dataset's cloud storage (an instance of ``IStorage``).
1261 | 1262 |
1262 |      | -    dataset = datasets_provenance.get_by_name(name=name, strict=True)
1263 |      | -
1264 |      | -    if not dataset.storage:
1265 |      | -        communication.warn(f"Dataset '{name}' doesn't have a storage backend")
1266 |      | -        return
     | 1263 | +    Returns:
     | 1264 | +        List[DatasetFile]: A list with the updated file if its metadata was missing; an empty list otherwise.
1267 | 1265 |
1268 |      | -    # NOTE: Try to unmount the path in case it was mounted before
1269 |      | -    unmount_path(project_context.path / dataset.get_datadir())
     | 1266 | +    """
     | 1267 | +    if not file.based_on:
     | 1268 | +        raise errors.DatasetImportError(f"Dataset file doesn't have a URI: {file.entity.path}")
1270 | 1269 |
1271 |      | -    create_symlinks = True
1272 |      | -    destination: Union[Path, str]
     | 1270 | +    path = project_context.path / file.entity.path
     | 1271 | +    path.parent.mkdir(parents=True, exist_ok=True)
1273 | 1272 |
1274 |      | -    if location:
1275 |      | -        destination = get_absolute_path(location)
1276 |      | -    else:
1277 |      | -        stored_location = read_dataset_data_location(dataset=dataset)
1278 |      | -        if stored_location:
1279 |      | -            destination = stored_location
1280 |      | -        else:
1281 |      | -            destination = project_context.path
1282 |      | -            create_symlinks = False
     | 1273 | +    # NOTE: Don't check if destination file exists. ``IStorage.copy`` won't copy a file if it exists and is not
     | 1274 | +    # modified.
1283 | 1275 |
1284 |      | -    provider = ProviderFactory.get_pull_provider(uri=dataset.storage)
1285 |      | -    storage = provider.get_storage()
     | 1276 | +    communication.start_progress(name=file.entity.path, total=1)
     | 1277 | +    try:
     | 1278 | +        storage.download(file.based_on.url, path)
     | 1279 | +        communication.update_progress(name=file.entity.path, amount=1)
     | 1280 | +    finally:
     | 1281 | +        communication.finalize_progress(name=file.entity.path)
1286 | 1282 |
1287 |      | -    updated_files = []
     | 1283 | +    # NOTE: File has no missing information
     | 1284 | +    if file.has_valid_checksum() and file.has_valid_size():
     | 1285 | +        return []
1288 | 1286 |
1289 |      | -    for file in dataset.files:
1290 |      | -        path = Path(destination) / file.entity.path
1291 |      | -        path.parent.mkdir(parents=True, exist_ok=True)
1292 |      | -        # NOTE: Don't check if destination exists. ``IStorage.copy`` won't copy a file if it exists and is not modified.
     | 1287 | +    if not file.has_valid_checksum():
     | 1288 | +        md5_hash = hash_file(path, hash_type="md5") or NON_EXISTING_ENTITY_CHECKSUM
     | 1289 | +        entity = Entity(path=file.entity.path, checksum=md5_hash)
     | 1290 | +        remote_entity = RemoteEntity(checksum=md5_hash, url=file.based_on.url, path=file.based_on.path)
     | 1291 | +    else:
     | 1292 | +        entity = file.entity
     | 1293 | +        remote_entity = file.based_on
     | 1294 | +
     | 1295 | +    size = file.size if file.has_valid_size() else get_file_size(path)
     | 1296 | +
     | 1297 | +    return [
     | 1298 | +        DatasetFile(
     | 1299 | +            entity=entity,
     | 1300 | +            based_on=remote_entity,
     | 1301 | +            size=size,
     | 1302 | +            date_added=file.date_added,
     | 1303 | +            date_removed=file.date_removed,
     | 1304 | +            source=file.source,
     | 1305 | +        )
     | 1306 | +    ]
1293 | 1307 |
1294 |      | -        if not file.based_on:
1295 |      | -            raise errors.DatasetImportError(f"Dataset file doesn't have a URI: {file.entity.path}")
1296 | 1308 |
1297 |      | -        with communication.busy(f"Copying {file.entity.path} ..."):
1298 |      | -            storage.download(file.based_on.url, path)
     | 1309 | +@validate_arguments(config=dict(arbitrary_types_allowed=True))
     | 1310 | +def pull_cloud_storage(name: str, location: Optional[Path] = None) -> None:
     | 1311 | +    """Pull/copy data for a cloud storage to a dataset's data directory or a specified location.
1299 | 1312 |
1300 |      | -        # NOTE: Make files read-only since we don't support pushing data to the remote storage
1301 |      | -        os.chmod(path, 0o400)
     | 1313 | +    Args:
     | 1314 | +        name(str): Name of the dataset
     | 1315 | +        location(Optional[Path]): A directory to copy data to (Default value = None).
     | 1316 | +    """
     | 1317 | +    dataset, datadir = _get_dataset_with_cloud_storage(name=name)
1302 | 1318 |
1303 |      | -        if not file.based_on.checksum:
1304 |      | -            md5_hash = hash_file(path, hash_type="md5") or ""
1305 |      | -            file.based_on = RemoteEntity(checksum=md5_hash, url=file.based_on.url, path=file.based_on.path)
     | 1319 | +    # NOTE: Try to unmount the path in case it was mounted before
     | 1320 | +    unmount_path(datadir)
1306 | 1321 |
1307 |      | -        new_file = DynamicProxy(file)
1308 |      | -        new_file.dataset = dataset
1309 |      | -        updated_files.append(new_file)
     | 1322 | +    if location:
     | 1323 | +        if not is_path_empty(datadir):
     | 1324 | +            communication.confirm(
     | 1325 | +                f"Dataset's data directory will be removed: {dataset.get_datadir()}. Do you want to continue?",
     | 1326 | +                abort=True,
     | 1327 | +                warning=True,
     | 1328 | +            )
     | 1329 | +        create_symlink(target=location, symlink_path=datadir, overwrite=True)
1310 | 1330 |
1311 |      | -        if create_symlinks:
1312 |      | -            symlink_path = project_context.path / file.entity.path
1313 |      | -            symlink_path.parent.mkdir(parents=True, exist_ok=True)
1314 |      | -            create_symlink(path=path, symlink_path=symlink_path, overwrite=True)
     | 1331 | +    provider = ProviderFactory.get_pull_provider(uri=dataset.storage)
     | 1332 | +    storage = provider.get_storage()
1315 | 1333 |
1316 |      | -    # NOTE: Store location in metadata in case where we want to mount the external storage in the same location
1317 |      | -    store_dataset_data_location(dataset=dataset, location=location)
     | 1334 | +    updated_files = parallel_execute(download_file, dataset.files, rate=5, storage=storage)
1318 | 1335 |
1319 | 1336 |     if updated_files:
1320 |      | -        _update_datasets_files_metadata(updated_files=updated_files, deleted_files=[], delete=False)
     | 1337 | +        dataset.add_or_update_files(updated_files)
     | 1338 | +        DatasetsProvenance().add_or_update(dataset, creator=get_git_user(repository=project_context.repository))
     | 1339 | +        project_context.database.commit()
1321 | 1340 |
1322 | 1341 |
1323 | 1342 | def store_dataset_data_location(dataset: Dataset, location: Optional[Path]) -> None:
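Note on the call site at new line 1334: ``parallel_execute`` is imported from ``renku.core.util.util``, but its implementation is not part of this diff. A minimal sketch of what such a helper could look like, assuming thread-pool fan-out semantics and that ``rate`` caps the number of concurrent workers (names and behavior below are assumptions, not renku's actual code):

```python
# Hypothetical stand-in for renku.core.util.util.parallel_execute, shown only to
# illustrate how download_file is fanned out over dataset.files; the real helper
# (and the exact meaning of its "rate" argument) may differ.
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, List, TypeVar

T = TypeVar("T")
R = TypeVar("R")


def parallel_execute(fn: Callable[..., List[R]], items: Iterable[T], *, rate: int = 5, **kwargs) -> List[R]:
    """Run fn(item, **kwargs) for each item on up to `rate` worker threads and flatten the results."""
    with ThreadPoolExecutor(max_workers=rate) as pool:
        per_item_results = list(pool.map(lambda item: fn(item, **kwargs), items))
    return [entry for result in per_item_results for entry in result]
```

Under that model, ``updated_files`` collects only the ``DatasetFile`` entries whose checksum or size had to be filled in by ``download_file``, which is why the ``if updated_files:`` branch re-adds them to the dataset and commits the metadata database.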
@@ -1358,7 +1377,7 @@ def mount_cloud_storage(name: str, existing: Optional[Path], yes: bool) -> None:
1358 | 1377 |         )
1359 | 1378 |
1360 | 1379 |     if existing:
1361 |      | -        create_symlink(path=existing, symlink_path=datadir, overwrite=True)
     | 1380 | +        create_symlink(target=existing, symlink_path=datadir, overwrite=True)
1362 | 1381 |         return
1363 | 1382 |
1364 | 1383 |     delete_path(datadir)