2121# limitations under the License.
2222
2323import errno
24+ from typing import cast
2425import uuid
2526import zipfile
2627import atexit
@@ -74,16 +75,37 @@ def is_data_entity(entity):
7475 return DATA_ENTITY_TYPES .intersection (as_list (entity .get ("@type" , [])))
7576
7677
def pick_type(json_entity, type_map, fallback=None, load_subcrates=False):
    """Map a JSON-LD entity to the class that should represent it.

    :param json_entity: JSON-LD dict for the entity; must carry an "@type".
    :param type_map: mapping of type name -> entity class, checked in order;
        the first name present in the entity's types wins.
    :param fallback: class returned when no type name matches.
    :param load_subcrates: when True, a Dataset whose "conformsTo" references
        an RO-Crate profile is reported as a Subcrate instead.
    :raises ValueError: if the entity has no "@type" key.
    """
    try:
        t = json_entity["@type"]
    except KeyError:
        # .get() so a missing "@id" cannot raise a second KeyError while
        # we are building the error message
        raise ValueError(f'entity {json_entity.get("@id")!r} has no @type')
    types = {_.strip() for _ in (t if isinstance(t, list) else [t])}
    entity_class = next(
        (c for name, c in type_map.items() if name in types), None
    )
    if entity_class is None:
        return fallback
    if entity_class is Dataset and load_subcrates:
        # A dataset is treated as a subcrate when its conformsTo entry
        # matches an RO-Crate profile.
        # TODO find a better way to check the profiles?
        for profile_ref in get_norm_value(json_entity, "conformsTo"):
            if profile_ref.startswith("https://w3id.org/ro/crate"):
                return Subcrate
    return entity_class
87109
88110
89111def get_version (metadata_properties ):
@@ -96,10 +118,16 @@ def get_version(metadata_properties):
96118
97119class ROCrate ():
98120
99- def __init__ (self , source = None , gen_preview = False , init = False , exclude = None , version = DEFAULT_VERSION ):
121+ def __init__ (self ,
122+ source = None ,
123+ gen_preview = False ,
124+ init = False , exclude = None ,
125+ version = DEFAULT_VERSION ,
126+ load_subcrates = False ):
100127 self .mode = None
101128 self .source = source
102129 self .exclude = exclude
130+ self .load_subcrates = load_subcrates
103131 self .__entity_map = {}
104132 # TODO: add this as @base in the context? At least when loading
105133 # from zip
@@ -182,6 +210,14 @@ def __read_data_entities(self, entities, source, gen_preview):
182210 self .__add_parts (parts , entities , source )
183211
184212 def __add_parts (self , parts , entities , source ):
213+ """
214+ Add entities to the crate from a list of entities id and Entity object.
215+
216+ :param self: Description
217+ :param parts: a list of dicts (one dict per entity) in the form {@id : "entity_id"}
218+ :param entities: a dict with the full list of entities information as in the hasPart of the root dataset of the crate.
219+ :param source: Description
220+ """
185221 type_map = OrderedDict ((_ .__name__ , _ ) for _ in subclasses (FileOrDir ))
186222 for ref in parts :
187223 id_ = ref ['@id' ]
@@ -192,16 +228,28 @@ def __add_parts(self, parts, entities, source):
192228 continue
193229 entity = entities .pop (id_ )
194230 assert id_ == entity .pop ('@id' )
195- cls = pick_type (entity , type_map , fallback = DataEntity )
196- if cls is DataEntity :
231+ cls = pick_type (entity , type_map , fallback = DataEntity , load_subcrates = self .load_subcrates )
232+
233+ if cls is Subcrate :
234+
235+ if is_url (id_ ):
236+ instance = Subcrate (self , source = id_ , properties = entity )
237+ else :
238+ instance = Subcrate (self , source = source / unquote (id_ ), properties = entity )
239+
240+ elif cls is DataEntity :
197241 instance = DataEntity (self , identifier = id_ , properties = entity )
242+
198243 else :
244+ # cls is either a File or a Dataset (Directory)
199245 if is_url (id_ ):
200246 instance = cls (self , id_ , properties = entity )
201247 else :
202248 instance = cls (self , source / unquote (id_ ), id_ , properties = entity )
203249 self .add (instance )
204250 if instance .type == "Dataset" :
251+ # for Subcrate, type is currently Dataset too,
252+ # but the hasPart is not populated yet only once accessing a subcrate element (lazy loading)
205253 self .__add_parts (as_list (entity .get ("hasPart" , [])), entities , source )
206254
207255 def __read_contextual_entities (self , entities ):
@@ -234,6 +282,11 @@ def contextual_entities(self):
234282 if not isinstance (e , (RootDataset , Metadata , Preview ))
235283 and not hasattr (e , "write" )]
236284
285+ @property
286+ def subcrate_entities (self ):
287+ return [e for e in self .__entity_map .values ()
288+ if isinstance (e , Subcrate )]
289+
237290 @property
238291 def name (self ):
239292 return self .root_dataset .get ('name' )
@@ -364,9 +417,31 @@ def get_entities(self):
364417 def _get_root_jsonld (self ):
365418 self .root_dataset .properties ()
366419
420+ def __contains__ (self , entity_id ):
421+ canonical_id = self .resolve_id (entity_id )
422+ return canonical_id in self .__entity_map
423+
367424 def dereference (self , entity_id , default = None ):
368425 canonical_id = self .resolve_id (entity_id )
369- return self .__entity_map .get (canonical_id , default )
426+
427+ if canonical_id in self .__entity_map :
428+ return self .__entity_map [canonical_id ]
429+
430+ for subcrate_entity in self .subcrate_entities :
431+
432+ # check if the entity_id might be within a subcrate
433+ # i.e entity_id would start with a subcrate id e.g subcrate/subfile.txt
434+ if entity_id .startswith (subcrate_entity .id ):
435+
436+ # replace id of subcrate to use get in the subcrate
437+ # subcrate/subfile.txt --> subfile.txt
438+ # dont use replace, as it could replace in the middle of the id
439+ entity_id_in_subcrate = entity_id [len (subcrate_entity .id ):]
440+
441+ return subcrate_entity .get_crate ().get (entity_id_in_subcrate , default = default )
442+
443+ # fallback
444+ return default
370445
371446 get = dereference
372447
@@ -413,6 +488,23 @@ def add_dataset(
413488 properties = properties
414489 ))
415490
491+ def add_subcrate (
492+ self ,
493+ source = None ,
494+ dest_path = None ,
495+ fetch_remote = False ,
496+ validate_url = False ,
497+ properties = None
498+ ):
499+ return self .add (Subcrate (
500+ self ,
501+ source = source ,
502+ dest_path = dest_path ,
503+ fetch_remote = fetch_remote ,
504+ validate_url = validate_url ,
505+ properties = properties
506+ ))
507+
416508 add_directory = add_dataset
417509
418510 def add_tree (self , source , dest_path = None , properties = None ):
@@ -492,7 +584,7 @@ def _copy_unlisted(self, top, base_path):
492584 for name in files :
493585 source = root / name
494586 rel = source .relative_to (top )
495- if not self . dereference ( str (rel )) :
587+ if str (rel ) not in self :
496588 dest = base_path / rel
497589 if not dest .exists () or not dest .samefile (source ):
498590 shutil .copyfile (source , dest )
@@ -550,7 +642,7 @@ def _stream_zip(self, chunk_size=8192, out_path=None):
550642 continue
551643
552644 rel = source .relative_to (self .source )
553- if not self . dereference ( str (rel )) and not str (rel ) in listed_files :
645+ if str (rel ) not in self and not str (rel ) in listed_files :
554646 with archive .open (str (rel ), mode = 'w' ) as out_file , open (source , 'rb' ) as in_file :
555647 while chunk := in_file .read (chunk_size ):
556648 out_file .write (chunk )
@@ -560,6 +652,10 @@ def _stream_zip(self, chunk_size=8192, out_path=None):
560652 while chunk := buffer .read (chunk_size ):
561653 yield chunk
562654
655+ def _all_streams (self , chunk_size = 8192 ):
656+ for writeable_entity in self .data_entities + self .default_entities :
657+ yield from writeable_entity .stream (chunk_size = chunk_size )
658+
563659 def add_workflow (
564660 self , source = None , dest_path = None , fetch_remote = False , validate_url = False , properties = None ,
565661 main = False , lang = "cwl" , lang_version = None , gen_cwl = False , cls = ComputationalWorkflow ,
@@ -782,6 +878,63 @@ def __validate_suite(self, suite):
782878 return suite
783879
784880
class Subcrate(Dataset):
    """Data entity representing an RO-Crate nested inside another RO-Crate.

    The nested crate is loaded lazily, on first call to :meth:`get_crate`.
    """

    def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
                 validate_url=False, properties=None, record_size=False):
        """
        Data-entity representing a subcrate inside another RO-Crate.

        :param crate: The parent crate
        :param source: The relative path to the subcrate, or its URL
        """
        super().__init__(crate, source, dest_path, fetch_remote,
                         validate_url, properties=properties, record_size=record_size)
        # ROCrate instance giving access to the nested RO-Crate, populated on
        # first access. Not to be confused with self.crate, which references
        # the parent crate. Callers should use get_crate() instead of reading
        # this attribute directly.
        self._crate = None

    def _empty(self):
        # Minimal JSON-LD for a subcrate: a Dataset conforming to RO-Crate.
        val = {"@id": self.id}
        val["@type"] = "Dataset"
        val["conformsTo"] = "https://w3id.org/ro/crate"
        return val

    def get_crate(self) -> ROCrate:
        """
        Return the RO-Crate object referenced by this subcrate.
        """
        # _load_subcrate is a no-op once the nested crate has been parsed
        self._load_subcrate()
        return cast(ROCrate, self._crate)

    def _load_subcrate(self):
        """
        Load the nested RO-Crate from the source path or URL.
        """
        if self._crate is None:
            # load_subcrates=True so deeper nested crates also load lazily
            self._crate = ROCrate(self.source, load_subcrates=True)

    def write(self, base_path):
        super().write(base_path)
        if self.crate.mode == Mode.CREATE:
            self.get_crate().write(base_path / unquote(self.id))

    def stream(self, chunk_size=8192):
        yield from super().stream(chunk_size=chunk_size)
        if self.crate.mode == Mode.CREATE:
            prefix = unquote(self.id)
            for path, chunk in self.get_crate()._all_streams(chunk_size=chunk_size):
                yield os.path.join(prefix, path), chunk
936+
937+
785938def make_workflow_rocrate (workflow_path , wf_type , include_files = [],
786939 fetch_remote = False , cwl = None , diagram = None ):
787940 wf_crate = ROCrate ()
0 commit comments