Skip to content

Commit c948a77

Browse files
feat(dataset): import dataset at specific tags (#2926)
Co-authored-by: Ralf Grubenmann <[email protected]>
1 parent 37d50ae commit c948a77

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+929
-559
lines changed

docs/reference/models/refs.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@
1818
File References
1919
===============
2020

21-
.. automodule:: renku.domain_model.refs
21+
.. automodule:: renku.core.migration.models.refs
2222
:members:

renku/command/checks/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from .external import check_missing_external_files
2323
from .githooks import check_git_hooks_installed
2424
from .migration import check_migration
25-
from .references import check_missing_references
2625
from .storage import check_lfs_info
2726
from .validate_shacl import check_datasets_structure, check_project_structure
2827

@@ -38,6 +37,5 @@
3837
"check_migration",
3938
"check_missing_external_files",
4039
"check_missing_files",
41-
"check_missing_references",
4240
"check_project_structure",
4341
)

renku/command/checks/references.py

Lines changed: 0 additions & 51 deletions
This file was deleted.

renku/command/format/dataset_files.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def json(records, **kwargs):
157157

158158
DATASET_FILES_COLUMNS = {
159159
"added": ("date_added", "added"),
160-
"commit": ("entity.checksum", "commit"),
160+
"checksum": ("entity.checksum", "checksum"),
161161
"creators": ("creators_csv", "creators"),
162162
"creators_full": ("creators_full_csv", "creators"),
163163
"dataset": ("title", "dataset"),
@@ -167,6 +167,7 @@ def json(records, **kwargs):
167167
"dataset_name": ("dataset_name", "dataset name"),
168168
"size": ("size", None),
169169
"lfs": ("is_lfs", "lfs"),
170+
"source": ("source", None),
170171
}
171172

172173
DATASET_FILES_COLUMNS_ALIGNMENTS = {"size": "right"}

renku/command/move.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,8 @@ def _warn_about_ignored_destinations(destinations, client_dispatcher: IClientDis
175175

176176
ignored = client.find_ignored_paths(*destinations)
177177
if ignored:
178-
ignored = "\n\t".join((str(Path(p).relative_to(client.path)) for p in ignored))
179-
communication.warn(f"The following moved path match .gitignore:\n\t{ignored}")
178+
ignored_str = "\n\t".join((str(Path(p).relative_to(client.path)) for p in ignored))
179+
communication.warn(f"The following moved path match .gitignore:\n\t{ignored_str}")
180180

181181

182182
@inject.autoparams()

renku/core/dataset/constant.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@
2121

2222
from renku.core.constant import RENKU_HOME
2323
from renku.core.management.repository import RepositoryApiMixin
24-
from renku.domain_model.refs import LinkReference
2524

2625
POINTERS = "pointers"
2726
"""Directory for storing external pointer files."""
2827

2928
DATASET_IMAGES = "dataset_images"
3029
"""Directory for dataset images."""
3130

31+
REFS = "refs"
32+
"""Define a name of the folder with references in the Renku folder."""
33+
3234

3335
def renku_dataset_images_path(client):
3436
"""Return a ``Path`` instance of Renku dataset metadata folder."""
@@ -46,6 +48,6 @@ def renku_pointers_path(client):
4648
Path(RENKU_HOME) / RepositoryApiMixin.DATABASE_PATH,
4749
Path(RENKU_HOME) / DATASET_IMAGES,
4850
Path(RENKU_HOME) / POINTERS,
49-
Path(RENKU_HOME) / LinkReference.REFS,
51+
Path(RENKU_HOME) / REFS,
5052
".gitattributes",
5153
]

renku/core/dataset/dataset.py

Lines changed: 67 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ def export_dataset(name, provider_name, publish, tag, client_dispatcher: IClient
369369
except KeyError:
370370
raise errors.ParameterError("Unknown provider.")
371371

372-
provider.set_parameters(**kwargs)
372+
provider.set_export_parameters(**kwargs)
373373

374374
selected_tag = None
375375
tags = datasets_provenance.get_all_tags(dataset) # type: ignore
@@ -424,6 +424,7 @@ def import_dataset(
424424
previous_dataset=None,
425425
delete=False,
426426
gitlab_token=None,
427+
**kwargs,
427428
):
428429
"""Import data from a 3rd party provider or another renku project.
429430
@@ -449,11 +450,13 @@ def import_dataset(
449450

450451
assert provider is not None
451452

453+
provider.set_import_parameters(**kwargs)
454+
452455
try:
453456
record = provider.find_record(uri, gitlab_token=gitlab_token)
454457
provider_dataset: ProviderDataset = record.as_dataset(client)
455458
files: List[ProviderDatasetFile] = record.files_info
456-
total_size = 0
459+
total_size = 0.0
457460

458461
if not yes:
459462
communication.echo(
@@ -477,9 +480,9 @@ def import_dataset(
477480

478481
communication.confirm(text_prompt, abort=True, warning=True)
479482

480-
for file_ in files:
481-
if file_.size_in_mb is not None:
482-
total_size += file_.size_in_mb
483+
for file in files:
484+
if file.size_in_mb is not None:
485+
total_size += file.size_in_mb
483486

484487
total_size *= 2**20
485488

@@ -509,7 +512,7 @@ def import_dataset(
509512
with_metadata=provider_dataset,
510513
force=True,
511514
extract=extract,
512-
all_at_once=True,
515+
is_import=True,
513516
destination_names=names,
514517
total_size=total_size,
515518
overwrite=True,
@@ -535,39 +538,51 @@ def import_dataset(
535538
if not provider_dataset.data_dir:
536539
raise errors.OperationError(f"Data directory for dataset must be set: {provider_dataset.name}")
537540

538-
sources = []
539-
540-
if record.datadir_exists:
541-
sources = [f"{provider_dataset.data_dir}/*"]
542-
543-
for file in files:
544-
try:
545-
Path(file.path).relative_to(provider_dataset.data_dir)
546-
except ValueError: # Files that are not in dataset's data directory
547-
sources.append(file.path)
541+
if provider_dataset.version: # NOTE: A tag was specified for import
542+
sources, checksums = zip(*[(f.path, f.checksum) for f in files]) # type: ignore
543+
else:
544+
sources = [f.path for f in files] # type: ignore
545+
checksums = None
548546

549547
new_dataset = add_data_to_dataset(
550548
urls=[record.project_url],
551549
dataset_name=name,
552550
sources=sources,
551+
checksums=checksums,
553552
with_metadata=provider_dataset,
553+
is_renku_import=True,
554554
create=not previous_dataset,
555555
overwrite=True,
556556
repository=record.repository,
557557
clear_files_before=True,
558+
dataset_datadir=provider_dataset.data_dir,
559+
force=True, # NOTE: Force-add to include any ignored files
558560
)
559561

560562
if previous_dataset:
561563
_update_datasets_metadata(new_dataset, previous_dataset, delete, provider_dataset.same_as)
562564

565+
if provider_dataset.tag:
566+
add_dataset_tag(
567+
dataset_name=new_dataset.name,
568+
tag=provider_dataset.tag.name,
569+
description=provider_dataset.tag.description,
570+
)
571+
elif provider_dataset.version:
572+
add_dataset_tag(
573+
dataset_name=new_dataset.name,
574+
tag=provider_dataset.version,
575+
description=f"Tag {provider_dataset.version} created by renku import",
576+
)
577+
563578
record.import_images(new_dataset)
564579

565580
database_dispatcher.current_database.commit()
566581

567582

568583
@inject.autoparams()
569584
def update_datasets(
570-
names,
585+
names: List[str],
571586
creators,
572587
include,
573588
exclude,
@@ -594,41 +609,56 @@ def update_datasets(
594609
client_dispatcher(IClientDispatcher): Injected client dispatcher.
595610
dataset_gateway(IDatasetGateway): Injected dataset gateway.
596611
"""
612+
from renku.core.dataset.providers.renku import RenkuProvider
613+
597614
if not update_all and not names and not include and not exclude and not dry_run:
598615
raise errors.ParameterError("No update criteria is specified")
599616

600617
client = client_dispatcher.current_client
601618

602-
imported_datasets: List[Dataset] = []
619+
imported_dataset_updates: List[Dataset] = []
603620

604621
all_datasets = dataset_gateway.get_all_active_datasets()
622+
imported_datasets = [d for d in all_datasets if d.same_as]
605623

606624
if names and update_all:
607625
raise errors.ParameterError("Cannot pass dataset names when updating all datasets")
608626
elif (include or exclude) and update_all:
609627
raise errors.ParameterError("Cannot specify include and exclude filters when updating all datasets")
610-
elif (include or exclude) and names and any(d.same_as for d in all_datasets if d.name in names):
628+
elif (include or exclude) and names and any(d for d in imported_datasets if d.name in names):
611629
raise errors.IncompatibleParametersError(a="--include/--exclude", b="imported datasets")
612630

613-
names_provided = bool(names)
631+
names = names or [d.name for d in all_datasets]
614632

615633
# NOTE: update imported datasets
616634
if not include and not exclude:
617-
for dataset in all_datasets:
618-
if names and dataset.name not in names or not dataset.same_as:
635+
must_match_records = False
636+
637+
for dataset in imported_datasets:
638+
if dataset.name not in names:
619639
continue
620640

621-
uri = dataset.same_as.url
622-
if isinstance(uri, dict):
623-
uri = cast(str, uri.get("@id"))
641+
uri = dataset.same_as.value # type: ignore
624642
provider, _ = ProviderFactory.from_uri(uri)
625643

626644
if not provider:
627645
continue
628646

629647
record = provider.find_record(uri)
630648

631-
if record.is_last_version(uri) and record.version == dataset.version:
649+
if isinstance(provider, RenkuProvider) and dataset.version is not None:
650+
tags = dataset_gateway.get_all_tags(dataset=dataset)
651+
tag = next((t for t in tags if t.name == dataset.version), None)
652+
# NOTE: Do not update Renku dataset that are imported from a specific version
653+
if tag is not None and tag.dataset_id.value == dataset.id:
654+
communication.echo(
655+
f"Skipped updating imported Renku dataset '{dataset.name}' with tag '{tag.name}'"
656+
)
657+
names.remove(dataset.name)
658+
continue
659+
660+
if record.is_last_version(uri) and record.is_version_equal_to(dataset):
661+
names.remove(dataset.name)
632662
continue
633663

634664
if not dry_run:
@@ -651,25 +681,25 @@ def update_datasets(
651681

652682
communication.echo(f"Updated dataset '{dataset.name}' from remote provider")
653683

654-
if names:
655-
names.remove(dataset.name)
656-
imported_datasets.append(dataset)
684+
names.remove(dataset.name)
685+
imported_dataset_updates.append(dataset)
657686
else:
658-
imported_datasets = [d for d in all_datasets if d.same_as]
687+
must_match_records = True
659688

660-
imported_datasets_view_models = [DatasetViewModel.from_dataset(d) for d in imported_datasets]
689+
imported_dataset_updates_view_models = [DatasetViewModel.from_dataset(d) for d in imported_dataset_updates]
661690

662-
if names_provided and not names:
663-
return imported_datasets_view_models, []
691+
if not names:
692+
return imported_dataset_updates_view_models, []
664693

694+
# NOTE: Exclude all imported dataset from individual file filter
665695
records = filter_dataset_files(
666696
names=names, creators=creators, include=include, exclude=exclude, ignore=[d.name for d in imported_datasets]
667697
)
668698

669699
if not records:
670-
if imported_datasets:
671-
return imported_datasets_view_models, []
672-
raise errors.ParameterError("No files matched the criteria.")
700+
if must_match_records:
701+
raise errors.ParameterError("No files matched the criteria.")
702+
return imported_dataset_updates_view_models, []
673703

674704
git_files = []
675705
unique_remotes = set()
@@ -730,7 +760,7 @@ def update_datasets(
730760
dataset_files_view_models = [
731761
DatasetFileViewModel.from_dataset_file(cast(DatasetFile, f), f.dataset) for f in updated_files + deleted_files
732762
]
733-
return imported_datasets_view_models, dataset_files_view_models
763+
return imported_dataset_updates_view_models, dataset_files_view_models
734764

735765

736766
def show_dataset(name: str, tag: Optional[str] = None):

0 commit comments

Comments (0)