Skip to content

Commit e846435

Browse files
authored
Merge pull request #244 from LauLauThom/subcrate
suggestion for reading subcrates
2 parents 0fb7db2 + b559bfe commit e846435

File tree

14 files changed

+522
-15
lines changed

14 files changed

+522
-15
lines changed

README.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,83 @@ article = crate.dereference("paper.pdf")
293293

294294
## Advanced features
295295

296+
### Subcrates
297+
298+
An RO-Crate can contain one or more nested RO-Crates. For instance, consider the following layout:
299+
300+
```
301+
crate_with_subcrates/
302+
|-- file.txt
303+
|-- ro-crate-metadata.json
304+
|-- subcrate
305+
| |-- ro-crate-metadata.json
306+
| |-- subfile.txt
307+
| `-- subsubcrate
308+
| |-- deepfile.txt
309+
| `-- ro-crate-metadata.json
310+
`-- subcrate2
311+
|-- ro-crate-metadata.json
312+
`-- subfile.txt
313+
```
314+
315+
In the JSON-LD metadata, the presence of a nested crate rooted at a given directory is indicated by a `conformsTo` pointing to the generic RO-Crate profile `https://w3id.org/ro/crate` (see [Referencing other RO-Crates](https://www.researchobject.org/ro-crate/specification/1.2/data-entities.html#referencing-other-ro-crates)):
316+
317+
```json
318+
{
319+
"@id": "subcrate/",
320+
"@type": "Dataset",
321+
"conformsTo": "https://w3id.org/ro/crate"
322+
}
323+
```
324+
325+
Since nested crates can potentially contain many and / or large files, they are not loaded by default: to enable their loading, pass `load_subcrates=True` to the `ROCrate` initializer:
326+
327+
```pycon
328+
>>> from rocrate.rocrate import ROCrate
329+
>>> crate = ROCrate("test/test-data/crate_with_subcrates", load_subcrates=True)
330+
>>> crate.subcrate_entities
331+
[<subcrate/ Dataset>, <subcrate2/ Dataset>]
332+
```
333+
334+
At this point, the nested crates have not been loaded yet. You can load a nested crate explicitly:
335+
336+
```pycon
337+
>>> subcrate = crate.dereference("subcrate/")
>>> nested_crate = subcrate.get_crate()
338+
>>> nested_crate.data_entities
339+
[<subfile.txt File>, <subsubcrate/ Dataset>]
340+
>>> nested_crate.subcrate_entities
341+
[<subsubcrate/ Dataset>]
342+
```
343+
344+
Alternatively, you can dereference an item from the higher level crate:
345+
346+
```pycon
347+
>>> crate.dereference("subcrate2/subfile.txt")
348+
<subfile.txt File>
349+
```
350+
351+
Up to this point, we have seen how to consume an existing RO-Crate. The following example shows how to create a new one:
352+
353+
```pycon
354+
>>> crate = ROCrate()
355+
>>> crate.add_file("test/test-data/test_file_galaxy.txt")
356+
<test_file_galaxy.txt File>
357+
>>> subcrate = crate.add_subcrate(dest_path="subcrate/")
358+
>>> subcrate
359+
<subcrate/ Dataset>
360+
>>> assert subcrate.get("conformsTo") == "https://w3id.org/ro/crate"
361+
>>> assert crate.subcrate_entities == [subcrate]
362+
>>> subcrate_crate = subcrate.get_crate()
363+
>>> subcrate_crate
364+
<rocrate.rocrate.ROCrate object at 0x7e4b79adfad0>
365+
>>> subsubcrate = subcrate_crate.add_subcrate(dest_path="subsubcrate/")
366+
>>> assert subcrate_crate.subcrate_entities == [subsubcrate]
367+
>>> subsubcrate_crate = subsubcrate.get_crate()
368+
>>> subsubf = subsubcrate_crate.add_file("setup.cfg")
369+
>>> assert crate.dereference("subcrate/subsubcrate/setup.cfg") is subsubf
370+
>>> crate.write("/tmp/crate_with_subcrates")
371+
```
372+
296373
### Modifying the crate from JSON-LD dictionaries
297374

298375
The `add_jsonld` method allows you to add a contextual entity directly from a

rocrate/model/entity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(self, crate, identifier=None, properties=None):
4343
if name.startswith("@"):
4444
self._jsonld[name] = value
4545
else:
46+
# this will call the __setitem__ method defined below
4647
self[name] = value
4748

4849
@property

rocrate/rocrate.py

Lines changed: 162 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# limitations under the License.
2222

2323
import errno
24+
from typing import cast
2425
import uuid
2526
import zipfile
2627
import atexit
@@ -74,16 +75,37 @@ def is_data_entity(entity):
7475
return DATA_ENTITY_TYPES.intersection(as_list(entity.get("@type", [])))
7576

7677

77-
def pick_type(json_entity, type_map, fallback=None, load_subcrates=False):
    """
    Choose the class used to instantiate a JSON-LD entity.

    :param json_entity: the JSON-LD dict describing the entity
    :param type_map: a mapping from ``@type`` names to classes; the first
        entry (in iteration order) whose name is among the entity's types wins
    :param fallback: value returned when no name in ``type_map`` matches
    :param load_subcrates: if true, a ``Dataset`` whose ``conformsTo``
        refers to an RO-Crate profile is mapped to ``Subcrate``
    :return: the selected class, or ``fallback``
    :raises ValueError: if the entity has no ``@type``
    """
    try:
        t = json_entity["@type"]
    except KeyError:
        # The "@id" key may already have been popped by the caller, so it is
        # read with .get(): a plain lookup would turn the intended ValueError
        # into a KeyError.
        raise ValueError(f'entity {json_entity.get("@id")!r} has no @type')
    types = {_.strip() for _ in set(t if isinstance(t, list) else [t])}

    # first class in type_map order whose name is one of the entity's types
    entity_class = next((c for name, c in type_map.items() if name in types), None)
    if not entity_class:
        return fallback

    if entity_class is not Dataset:
        return entity_class

    # A Dataset is a Subcrate if it has a conformsTo entry matching an
    # RO-Crate profile.
    # TODO find a better way to check the profiles?
    if load_subcrates:
        profiles = get_norm_value(json_entity, "conformsTo")
        if any(p.startswith("https://w3id.org/ro/crate") for p in profiles):
            return Subcrate
    return Dataset
87109

88110

89111
def get_version(metadata_properties):
@@ -96,10 +118,16 @@ def get_version(metadata_properties):
96118

97119
class ROCrate():
98120

99-
def __init__(self, source=None, gen_preview=False, init=False, exclude=None, version=DEFAULT_VERSION):
121+
def __init__(self,
122+
source=None,
123+
gen_preview=False,
124+
init=False, exclude=None,
125+
version=DEFAULT_VERSION,
126+
load_subcrates=False):
100127
self.mode = None
101128
self.source = source
102129
self.exclude = exclude
130+
self.load_subcrates = load_subcrates
103131
self.__entity_map = {}
104132
# TODO: add this as @base in the context? At least when loading
105133
# from zip
@@ -182,6 +210,14 @@ def __read_data_entities(self, entities, source, gen_preview):
182210
self.__add_parts(parts, entities, source)
183211

184212
def __add_parts(self, parts, entities, source):
213+
"""
214+
Add data entities to the crate, given a list of entity id references and the JSON-LD descriptions of the entities.
215+
216+
:param self: the crate the entities are added to
217+
:param parts: a list of dicts (one dict per entity) in the form {@id : "entity_id"}
218+
:param entities: a dict mapping each entity id to its JSON-LD description; entries are removed from it as the corresponding entities are added to the crate.
219+
:param source: the base location of the crate, against which non-URL entity ids are resolved
220+
"""
185221
type_map = OrderedDict((_.__name__, _) for _ in subclasses(FileOrDir))
186222
for ref in parts:
187223
id_ = ref['@id']
@@ -192,16 +228,28 @@ def __add_parts(self, parts, entities, source):
192228
continue
193229
entity = entities.pop(id_)
194230
assert id_ == entity.pop('@id')
195-
cls = pick_type(entity, type_map, fallback=DataEntity)
196-
if cls is DataEntity:
231+
cls = pick_type(entity, type_map, fallback=DataEntity, load_subcrates=self.load_subcrates)
232+
233+
if cls is Subcrate:
234+
235+
if is_url(id_):
236+
instance = Subcrate(self, source=id_, properties=entity)
237+
else:
238+
instance = Subcrate(self, source=source / unquote(id_), properties=entity)
239+
240+
elif cls is DataEntity:
197241
instance = DataEntity(self, identifier=id_, properties=entity)
242+
198243
else:
244+
# cls is either a File or a Dataset (Directory)
199245
if is_url(id_):
200246
instance = cls(self, id_, properties=entity)
201247
else:
202248
instance = cls(self, source / unquote(id_), id_, properties=entity)
203249
self.add(instance)
204250
if instance.type == "Dataset":
251+
# for Subcrate, type is currently Dataset too,
252+
# but its hasPart is not populated yet: that only happens when a subcrate element is accessed (lazy loading)
205253
self.__add_parts(as_list(entity.get("hasPart", [])), entities, source)
206254

207255
def __read_contextual_entities(self, entities):
@@ -234,6 +282,11 @@ def contextual_entities(self):
234282
if not isinstance(e, (RootDataset, Metadata, Preview))
235283
and not hasattr(e, "write")]
236284

285+
@property
def subcrate_entities(self):
    """
    List the entities of this crate that are ``Subcrate`` instances.
    """
    found = []
    for candidate in self.__entity_map.values():
        if isinstance(candidate, Subcrate):
            found.append(candidate)
    return found
289+
237290
@property
238291
def name(self):
239292
return self.root_dataset.get('name')
@@ -364,9 +417,31 @@ def get_entities(self):
364417
def _get_root_jsonld(self):
365418
self.root_dataset.properties()
366419

420+
def __contains__(self, entity_id):
    """
    Tell whether an entity with the given id is registered in this crate.

    The id is resolved to its canonical form before the lookup. Only this
    crate's own entity map is consulted.
    """
    return self.resolve_id(entity_id) in self.__entity_map
423+
367424
def dereference(self, entity_id, default=None):
    """
    Return the entity with the given id, or ``default`` if there is none.

    The id is first looked up (in canonical form) among this crate's own
    entities. If it is not found there and it starts with the id of one of
    the crate's subcrate entities, the lookup is delegated to the
    corresponding nested crate, using the id with the subcrate prefix
    removed (e.g., ``subcrate/subfile.txt`` --> ``subfile.txt``).
    """
    key = self.resolve_id(entity_id)
    try:
        return self.__entity_map[key]
    except KeyError:
        pass

    # first subcrate entity whose id is a prefix of the requested id
    owner = next(
        (sub for sub in self.subcrate_entities if entity_id.startswith(sub.id)),
        None,
    )
    if owner is None:
        return default

    # strip the prefix by slicing: str.replace could also hit a match in
    # the middle of the id
    inner_id = entity_id[len(owner.id):]
    return owner.get_crate().get(inner_id, default=default)
370445

371446
get = dereference
372447

@@ -413,6 +488,23 @@ def add_dataset(
413488
properties=properties
414489
))
415490

491+
def add_subcrate(self, source=None, dest_path=None, fetch_remote=False,
                 validate_url=False, properties=None):
    """
    Create a ``Subcrate`` entity, add it to this crate and return the
    result of the ``add`` call.

    All arguments are forwarded to the ``Subcrate`` initializer.
    """
    nested = Subcrate(
        self,
        source=source,
        dest_path=dest_path,
        fetch_remote=fetch_remote,
        validate_url=validate_url,
        properties=properties,
    )
    return self.add(nested)
507+
416508
add_directory = add_dataset
417509

418510
def add_tree(self, source, dest_path=None, properties=None):
@@ -492,7 +584,7 @@ def _copy_unlisted(self, top, base_path):
492584
for name in files:
493585
source = root / name
494586
rel = source.relative_to(top)
495-
if not self.dereference(str(rel)):
587+
if str(rel) not in self:
496588
dest = base_path / rel
497589
if not dest.exists() or not dest.samefile(source):
498590
shutil.copyfile(source, dest)
@@ -550,7 +642,7 @@ def _stream_zip(self, chunk_size=8192, out_path=None):
550642
continue
551643

552644
rel = source.relative_to(self.source)
553-
if not self.dereference(str(rel)) and not str(rel) in listed_files:
645+
if str(rel) not in self and not str(rel) in listed_files:
554646
with archive.open(str(rel), mode='w') as out_file, open(source, 'rb') as in_file:
555647
while chunk := in_file.read(chunk_size):
556648
out_file.write(chunk)
@@ -560,6 +652,10 @@ def _stream_zip(self, chunk_size=8192, out_path=None):
560652
while chunk := buffer.read(chunk_size):
561653
yield chunk
562654

655+
def _all_streams(self, chunk_size=8192):
    """
    Yield everything produced by the ``stream`` method of each data entity
    and then of each default entity of this crate.
    """
    to_stream = self.data_entities + self.default_entities
    for entity in to_stream:
        yield from entity.stream(chunk_size=chunk_size)
658+
563659
def add_workflow(
564660
self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None,
565661
main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow,
@@ -782,6 +878,63 @@ def __validate_suite(self, suite):
782878
return suite
783879

784880

881+
class Subcrate(Dataset):
    """
    Dataset entity that stands for an RO-Crate nested inside another one.
    """

    def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
                 validate_url=False, properties=None, record_size=False):
        """
        Data-entity representing a subcrate inside another RO-Crate.

        :param crate: The parent crate
        :param source: The relative path to the subcrate, or its URL
        """
        super().__init__(
            crate,
            source,
            dest_path,
            fetch_remote,
            validate_url,
            properties=properties,
            record_size=record_size,
        )
        # ROCrate instance giving access to the nested RO-Crate; it stays
        # None until get_crate() is called for the first time. Not to be
        # confused with the ``crate`` attribute, which refers to the parent
        # crate. Callers should go through get_crate() rather than reading
        # this attribute.
        self._crate = None

    def _empty(self):
        """
        Return the minimal JSON-LD for a subcrate: a Dataset that conforms
        to the generic RO-Crate profile.
        """
        skeleton = {"@id": self.id, "@type": "Dataset"}
        skeleton["conformsTo"] = "https://w3id.org/ro/crate"
        return skeleton

    def get_crate(self) -> ROCrate:
        """
        Return the nested RO-Crate, loading it on first use.
        """
        if self._crate is None:
            self._load_subcrate()
        nested = self._crate
        return cast(ROCrate, nested)

    def _load_subcrate(self):
        """
        Build the nested ROCrate from this entity's source, unless it has
        been built already.
        """
        if self._crate is not None:
            return
        # load_subcrates=True so that crates nested further down are
        # recognized as well (and, in turn, loaded on demand)
        self._crate = ROCrate(self.source, load_subcrates=True)

    def write(self, base_path):
        """
        Write this entity under ``base_path``; when the parent crate is in
        CREATE mode, also write the nested crate to its own subdirectory.
        """
        super().write(base_path)
        parent_is_new = self.crate.mode == Mode.CREATE
        if parent_is_new:
            target = base_path / unquote(self.id)
            self.get_crate().write(target)

    def stream(self, chunk_size=8192):
        """
        Yield the parent class stream; when the parent crate is in CREATE
        mode, also yield the nested crate's ``(path, chunk)`` pairs, with
        each path prefixed by this entity's (unquoted) id.
        """
        yield from super().stream(chunk_size=chunk_size)
        parent_is_new = self.crate.mode == Mode.CREATE
        if parent_is_new:
            nested_streams = self.get_crate()._all_streams(chunk_size=chunk_size)
            for inner_path, data in nested_streams:
                yield os.path.join(unquote(self.id), inner_path), data
936+
937+
785938
def make_workflow_rocrate(workflow_path, wf_type, include_files=[],
786939
fetch_remote=False, cwl=None, diagram=None):
787940
wf_crate = ROCrate()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
empty

0 commit comments

Comments
 (0)