@@ -369,7 +369,7 @@ def export_dataset(name, provider_name, publish, tag, client_dispatcher: IClient
    except KeyError:
        raise errors.ParameterError("Unknown provider.")

-    provider.set_parameters(**kwargs)
+    provider.set_export_parameters(**kwargs)

    selected_tag = None
    tags = datasets_provenance.get_all_tags(dataset)  # type: ignore
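
Note: this hunk (together with the set_import_parameters call added below) splits the provider's single set_parameters() hook into separate export- and import-time hooks. A minimal sketch of what a provider might look like after this change — the class name, attributes, and option names are illustrative assumptions, not renku's actual ProviderApi:

    class ExampleProvider:
        """Hypothetical provider showing the split parameter hooks."""

        def set_export_parameters(self, **kwargs):
            # Export-specific options arrive here from export_dataset().
            self._publish = kwargs.get("publish", False)

        def set_import_parameters(self, **kwargs):
            # Import-specific options arrive here from import_dataset(); a
            # requested tag would be stored and honored when resolving records.
            self._tag = kwargs.get("tag")
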
@@ -424,6 +424,7 @@ def import_dataset(
    previous_dataset=None,
    delete=False,
    gitlab_token=None,
+    **kwargs,
):
    """Import data from a 3rd party provider or another renku project.
@@ -449,11 +450,13 @@ def import_dataset(
    assert provider is not None

+    provider.set_import_parameters(**kwargs)
+
    try:
        record = provider.find_record(uri, gitlab_token=gitlab_token)
        provider_dataset: ProviderDataset = record.as_dataset(client)
        files: List[ProviderDatasetFile] = record.files_info
-        total_size = 0
+        total_size = 0.0

        if not yes:
            communication.echo(
@@ -477,9 +480,9 @@ def import_dataset(
            communication.confirm(text_prompt, abort=True, warning=True)

-        for file_ in files:
-            if file_.size_in_mb is not None:
-                total_size += file_.size_in_mb
+        for file in files:
+            if file.size_in_mb is not None:
+                total_size += file.size_in_mb

        total_size *= 2**20
@@ -509,7 +512,7 @@ def import_dataset(
            with_metadata=provider_dataset,
            force=True,
            extract=extract,
-            all_at_once=True,
+            is_import=True,
            destination_names=names,
            total_size=total_size,
            overwrite=True,
@@ -535,39 +538,51 @@ def import_dataset(
        if not provider_dataset.data_dir:
            raise errors.OperationError(f"Data directory for dataset must be set: {provider_dataset.name}")

-        sources = []
-
-        if record.datadir_exists:
-            sources = [f"{provider_dataset.data_dir}/*"]
-
-        for file in files:
-            try:
-                Path(file.path).relative_to(provider_dataset.data_dir)
-            except ValueError:  # Files that are not in dataset's data directory
-                sources.append(file.path)
+        if provider_dataset.version:  # NOTE: A tag was specified for import
+            sources, checksums = zip(*[(f.path, f.checksum) for f in files])  # type: ignore
+        else:
+            sources = [f.path for f in files]  # type: ignore
+            checksums = None

        new_dataset = add_data_to_dataset(
            urls=[record.project_url],
            dataset_name=name,
            sources=sources,
+            checksums=checksums,
            with_metadata=provider_dataset,
+            is_renku_import=True,
            create=not previous_dataset,
            overwrite=True,
            repository=record.repository,
            clear_files_before=True,
+            dataset_datadir=provider_dataset.data_dir,
+            force=True,  # NOTE: Force-add to include any ignored files
        )

        if previous_dataset:
            _update_datasets_metadata(new_dataset, previous_dataset, delete, provider_dataset.same_as)

+        if provider_dataset.tag:
+            add_dataset_tag(
+                dataset_name=new_dataset.name,
+                tag=provider_dataset.tag.name,
+                description=provider_dataset.tag.description,
+            )
+        elif provider_dataset.version:
+            add_dataset_tag(
+                dataset_name=new_dataset.name,
+                tag=provider_dataset.version,
+                description=f"Tag {provider_dataset.version} created by renku import",
+            )
+
        record.import_images(new_dataset)

    database_dispatcher.current_database.commit()


@inject.autoparams()
def update_datasets(
-    names,
+    names: List[str],
    creators,
    include,
    exclude,
@@ -594,41 +609,56 @@ def update_datasets(
        client_dispatcher(IClientDispatcher): Injected client dispatcher.
        dataset_gateway(IDatasetGateway): Injected dataset gateway.
    """
+    from renku.core.dataset.providers.renku import RenkuProvider
+
    if not update_all and not names and not include and not exclude and not dry_run:
        raise errors.ParameterError("No update criteria is specified")

    client = client_dispatcher.current_client

-    imported_datasets: List[Dataset] = []
+    imported_dataset_updates: List[Dataset] = []

    all_datasets = dataset_gateway.get_all_active_datasets()
+    imported_datasets = [d for d in all_datasets if d.same_as]

    if names and update_all:
        raise errors.ParameterError("Cannot pass dataset names when updating all datasets")
    elif (include or exclude) and update_all:
        raise errors.ParameterError("Cannot specify include and exclude filters when updating all datasets")
-    elif (include or exclude) and names and any(d.same_as for d in all_datasets if d.name in names):
+    elif (include or exclude) and names and any(d for d in imported_datasets if d.name in names):
        raise errors.IncompatibleParametersError(a="--include/--exclude", b="imported datasets")

-    names_provided = bool(names)
+    names = names or [d.name for d in all_datasets]

    # NOTE: update imported datasets
    if not include and not exclude:
-        for dataset in all_datasets:
-            if names and dataset.name not in names or not dataset.same_as:
+        must_match_records = False
+
+        for dataset in imported_datasets:
+            if dataset.name not in names:
                continue

-            uri = dataset.same_as.url
-            if isinstance(uri, dict):
-                uri = cast(str, uri.get("@id"))
+            uri = dataset.same_as.value  # type: ignore
            provider, _ = ProviderFactory.from_uri(uri)

            if not provider:
                continue

            record = provider.find_record(uri)

-            if record.is_last_version(uri) and record.version == dataset.version:
+            if isinstance(provider, RenkuProvider) and dataset.version is not None:
+                tags = dataset_gateway.get_all_tags(dataset=dataset)
+                tag = next((t for t in tags if t.name == dataset.version), None)
+                # NOTE: Do not update Renku datasets that are imported from a specific version
+                if tag is not None and tag.dataset_id.value == dataset.id:
+                    communication.echo(
+                        f"Skipped updating imported Renku dataset '{dataset.name}' with tag '{tag.name}'"
+                    )
+                    names.remove(dataset.name)
+                    continue
+
+            if record.is_last_version(uri) and record.is_version_equal_to(dataset):
+                names.remove(dataset.name)
                continue

            if not dry_run:
@@ -651,25 +681,25 @@ def update_datasets(
            communication.echo(f"Updated dataset '{dataset.name}' from remote provider")

-            if names:
-                names.remove(dataset.name)
-            imported_datasets.append(dataset)
+            names.remove(dataset.name)
+            imported_dataset_updates.append(dataset)
    else:
-        imported_datasets = [d for d in all_datasets if d.same_as]
+        must_match_records = True

-    imported_datasets_view_models = [DatasetViewModel.from_dataset(d) for d in imported_datasets]
+    imported_dataset_updates_view_models = [DatasetViewModel.from_dataset(d) for d in imported_dataset_updates]

-    if names_provided and not names:
-        return imported_datasets_view_models, []
+    if not names:
+        return imported_dataset_updates_view_models, []

+    # NOTE: Exclude all imported datasets from the individual file filter
    records = filter_dataset_files(
        names=names, creators=creators, include=include, exclude=exclude, ignore=[d.name for d in imported_datasets]
    )

    if not records:
-        if imported_datasets:
-            return imported_datasets_view_models, []
-        raise errors.ParameterError("No files matched the criteria.")
+        if must_match_records:
+            raise errors.ParameterError("No files matched the criteria.")
+        return imported_dataset_updates_view_models, []

    git_files = []
    unique_remotes = set()
@@ -730,7 +760,7 @@ def update_datasets(
    dataset_files_view_models = [
        DatasetFileViewModel.from_dataset_file(cast(DatasetFile, f), f.dataset) for f in updated_files + deleted_files
    ]
-    return imported_datasets_view_models, dataset_files_view_models
+    return imported_dataset_updates_view_models, dataset_files_view_models


def show_dataset(name: str, tag: Optional[str] = None):
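Usage sketch: with **kwargs now forwarded through import_dataset() into provider.set_import_parameters(), and the add_dataset_tag() calls added above, a tagged import could look like the following. The tag keyword and the URL are assumptions for illustration; only uri, name, and yes are visible in this diff:

    # Sketch: importing a Renku dataset pinned to a specific tag.
    # `tag` travels via **kwargs into provider.set_import_parameters(**kwargs);
    # after the data is added, add_dataset_tag() re-creates the tag locally.
    import_dataset(
        uri="https://renkulab.io/projects/user/project/datasets/my-dataset",  # hypothetical URL
        name="my-dataset",
        yes=True,
        tag="v1.0",  # hypothetical keyword consumed by set_import_parameters()
    )

After the files are added, the dataset is tagged locally: with an explicit tag, add_dataset_tag() reuses the tag's name and description; otherwise, when a version was resolved, it creates a tag named after that version.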