Skip to content

Commit 4382b80

Browse files
committed
type ro:Folder for directories
also export ORE resource map as separate annotation
1 parent 492d6ee commit 4382b80

File tree

3 files changed

+92
-17
lines changed

3 files changed

+92
-17
lines changed

cwltool/provenance.py

Lines changed: 87 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ class PermissionError(OSError): # pylint: disable=redefined-builtin
9393
PROVENANCE = os.path.join(METADATA, "provenance")
9494
WFDESC = Namespace("wfdesc", 'http://purl.org/wf4ever/wfdesc#')
9595
WFPROV = Namespace("wfprov", 'http://purl.org/wf4ever/wfprov#')
96+
RO = Namespace("ro", 'http://purl.org/wf4ever/ro#')
97+
ORE = Namespace("ore", 'http://www.openarchives.org/ore/terms/')
9698
FOAF = Namespace("foaf", 'http://xmlns.com/foaf/0.1/')
9799
SCHEMA = Namespace("schema", 'http://schema.org/')
98100
CWLPROV = Namespace('cwlprov', 'https://w3id.org/cwl/prov#')
@@ -373,6 +375,9 @@ def host_provenance(document):
373375

374376
# info only, won't really be used by prov as sub-resources use /
375377
self.document.add_namespace('researchobject', self.research_object.base_uri)
378+
# annotations
379+
self.metadata_ns = self.document.add_namespace('metadata',
380+
self.research_object.base_uri + _posix_path(METADATA) + "/")
376381
# Pre-register provenance directory so we can refer to its files
377382
self.provenance_ns = self.document.add_namespace('provenance',
378383
self.research_object.base_uri + _posix_path(PROVENANCE) + "/")
@@ -539,8 +544,10 @@ def declare_artefact(self, value):
539544
# FIXME: Make consistent hash URIs for these
540545
# that somehow include the type
541546
# (so "1" != 1 != "1.0" != true)
542-
return self.document.entity(uuid.uuid4().urn,
547+
e = self.document.entity(uuid.uuid4().urn,
543548
{ provM.PROV_VALUE: value })
549+
self.research_object.add_uri(e.identifier.uri)
550+
return e
544551

545552
elif isinstance(value, (Text, str)):
546553
# Save as string in UTF-8
@@ -598,37 +605,92 @@ def declare_artefact(self, value):
598605
# attempt to keep it inside the value dictionary
599606
dir_id = value.setdefault("id",
600607
uuid.uuid4().urn)
608+
609+
# New annotation file to keep the ORE Folder listing
610+
ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
611+
dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])
612+
601613
coll = self.document.entity(dir_id,
602614
[ (provM.PROV_TYPE, WFPROV["Artifact"]),
603615
(provM.PROV_TYPE, PROV["Collection"]),
604616
(provM.PROV_TYPE, PROV["Dictionary"]),
605-
(provM.PROV_TYPE, CWLPROV["Directory"]),
617+
(provM.PROV_TYPE, RO["Folder"]),
606618
])
607-
coll_attribs = [] # type ( tuple(Identifier, ProvEntity) )
619+
# ORE description of ro:Folder, saved separately
620+
coll_b = dir_bundle.entity(dir_id,
621+
[
622+
(provM.PROV_TYPE, RO["Folder"]),
623+
(provM.PROV_TYPE, ORE["Aggregation"]),
624+
])
625+
self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)
626+
627+
dir_manifest = dir_bundle.entity(dir_bundle.identifier,
628+
{PROV["type"]: ORE["ResourceMap"],
629+
ORE["describes"]: coll_b.identifier}
630+
)
631+
632+
coll_attribs = [ # type ( tuple(Identifier, ProvEntity) )
633+
(ORE["isDescribedBy"], dir_bundle.identifier )
634+
]
635+
coll_b_attribs = [] # type ( tuple(Identifier, ProvEntity) )
636+
608637
# FIXME: .listing might not be populated yet - hopefully
609638
# a later call to this method will sort that
610639
for f in value.get("listing", []):
611640
# Declare child-artifacts
612641
entity = self.declare_artefact(f)
613-
# TODO: Add filename to PROV-dictionary
614642
self.document.membership(coll, entity)
615-
# Membership
616-
m = self.document.entity(uuid.uuid4().urn)
617-
# Note: only support PROV-O style dictionary
643+
# Membership relation aka our ORE Proxy
644+
m_id = uuid.uuid4().urn
645+
m = self.document.entity(m_id)
646+
m_b = dir_bundle.entity(m_id)
647+
648+
# PROV-O style Dictionary
618649
# https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
619-
# as prov.py do not easily allow PROV-N extensions
650+
# ..as prov.py does not currently allow PROV-N extensions
651+
# like hadDictionaryMember(..)
620652
m.add_asserted_type(PROV["KeyEntityPair"])
653+
621654
m.add_attributes({
622655
PROV["pairKey"]: f["basename"],
623-
PROV["pairEntity"]: entity
656+
PROV["pairEntity"]: entity,
657+
})
658+
659+
# As well as being a
660+
# http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
661+
m_b.add_asserted_type(RO["FolderEntry"])
662+
m_b.add_asserted_type(ORE["Proxy"])
663+
m_b.add_attributes({
664+
RO["entryName"]: f["basename"],
665+
ORE["proxyIn"]: coll,
666+
ORE["proxyFor"]: entity,
667+
624668
})
625669
coll_attribs.append(
626670
(PROV["hadDictionaryMember"], m))
671+
coll_b_attribs.append(
672+
(ORE["aggregates"], m_b))
673+
627674
coll.add_attributes(coll_attribs)
675+
coll_b.add_attributes(coll_b_attribs)
676+
677+
# Also Save ORE Folder as annotation metadata
678+
ore_doc = ProvDocument()
679+
ore_doc.add_namespace(ORE)
680+
ore_doc.add_namespace(RO)
681+
ore_doc.add_namespace(UUID)
682+
ore_doc.add_bundle(dir_bundle)
683+
ore_doc = ore_doc.flattened()
684+
ore_doc_path = posixpath.join(_posix_path(METADATA), ore_doc_fn)
685+
with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
686+
ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
687+
self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri)
688+
628689
if not coll_attribs:
629690
# Empty directory
630691
coll.add_asserted_type(PROV["EmptyCollection"])
631692
coll.add_asserted_type(PROV["EmptyDictionary"])
693+
self.research_object.add_uri(coll.identifier.uri)
632694
return coll
633695
else:
634696
# some other kind of dictionary?
@@ -661,6 +723,7 @@ def declare_artefact(self, value):
661723
coll_attribs.append(
662724
(PROV["hadDictionaryMember"], m))
663725
coll.add_attributes(coll_attribs)
726+
self.research_object.add_uri(coll.identifier.uri)
664727
return coll
665728

666729
# some other kind of Collection?
@@ -686,13 +749,16 @@ def declare_artefact(self, value):
686749
# we would need to use PROV.Dictionary
687750
# with numeric keys
688751
self.document.membership(coll, e)
752+
self.research_object.add_uri(coll.identifier.uri)
689753
return coll
690754
except TypeError:
691755
_logger.warning("Unrecognized type %s of %r" %
692756
(type(value), value))
693757
# Let's just fall back to Python repr()
694-
return self.document.entity(uuid.uuid4().urn,
758+
e = self.document.entity(uuid.uuid4().urn,
695759
{ provM.PROV_LABEL: repr(value) })
760+
self.research_object.add_uri(e.identifier.uri)
761+
return e
696762

697763
def used_artefacts(self,
698764
job_order, # type: Dict
@@ -909,6 +975,7 @@ def __init__(self, temp_prefix_ro="tmp", orcid=None, full_name=None):
909975
self.bagged_size = {} # type: Dict
910976
self.tagfiles = set() # type: Set
911977
self._file_provenance = {} # type: Dict
978+
self._external_aggregates = [] # type: List[Dict]
912979
self.annotations = [] # type: List[Dict]
913980
self._content_types = {} # type: Dict[Text,str]
914981

@@ -1093,7 +1160,7 @@ def guess_mediatype(rel_path):
10931160
local_aggregate["conformsTo"] = prov_conforms_to[extension]
10941161
return local_aggregate
10951162

1096-
aggregates = []
1163+
aggregates = [] # type: List[Dict]
10971164
for path in self.bagged_size.keys():
10981165
aggregate_dict = {} # type: Dict[str,Any]
10991166

@@ -1133,10 +1200,9 @@ def guess_mediatype(rel_path):
11331200
if path == posixpath.join(METADATA, "manifest.json"):
11341201
# Should not really be there yet! But anyway, we won't
11351202
# aggregate it.
1136-
11371203
continue
11381204

1139-
rel_aggregates = {}
1205+
rel_aggregates = {} # type: Dict[str,Any]
11401206
# These are local paths like metadata/provenance - but
11411207
# we need to relativize them for our current directory
11421208
# as we are saved in metadata/manifest.json
@@ -1152,8 +1218,16 @@ def guess_mediatype(rel_path):
11521218
# make new timestamp?
11531219
rel_aggregates.update(self._self_made())
11541220
aggregates.append(rel_aggregates)
1221+
aggregates.extend(self._external_aggregates)
11551222
return aggregates
11561223

1224+
def add_uri(self, uri, when=None):
1225+
# type: (str, Optional[datetime.datetime]) -> Dict
1226+
aggr = self._self_made(when=when)
1227+
aggr["uri"] = uri
1228+
self._external_aggregates.append(aggr)
1229+
return aggr
1230+
11571231
def add_annotation(self, about, content, motivatedBy="oa:describing"):
11581232
# type: (str, List[str], str) -> str
11591233

tests/test_provenance.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
# RDF namespaces we'll query for later
2828
ORE = Namespace("http://www.openarchives.org/ore/terms/")
2929
PROV = Namespace("http://www.w3.org/ns/prov#")
30+
RO = Namespace("http://purl.org/wf4ever/ro#")
3031
WFDESC = Namespace("http://purl.org/wf4ever/wfdesc#")
3132
WFPROV = Namespace("http://purl.org/wf4ever/wfprov#")
3233
SCHEMA = Namespace("http://schema.org/")
@@ -123,7 +124,7 @@ def test_directory_workflow(self):
123124
self.assertTrue(os.path.isfile(p),
124125
"Could not find %s as %s" % (l, p))
125126

126-
def check_provenance(self, nested=False, single_tool=False, directory=True):
127+
def check_provenance(self, nested=False, single_tool=False, directory=False):
127128
self.check_folders()
128129
self.check_bagit()
129130
self.check_ro(nested=nested)
@@ -357,8 +358,8 @@ def check_prov(self, nested=False, single_tool=False, directory=False):
357358
# TODO: Check g2 statements that it's the same UUID activity inside
358359
# as in the outer step
359360
if directory:
360-
# TODO: Test directory
361-
pass
361+
directories = set(g.subjects(RDF.type, RO.Folder))
362+
self.assertTrue(directories)
362363

363364

364365
class TestConvertPath(unittest.TestCase):

typeshed/2and3/prov/model.pyi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ class ProvDocument(ProvBundle):
245245
def has_bundles(self): ...
246246
@property
247247
def bundles(self): ...
248-
def flattened(self): ...
248+
def flattened(self) -> ProvDocument: ...
249249
def unified(self): ...
250250
def update(self, other: Any) -> None: ...
251251
def add_bundle(self, bundle: Any, identifier: Optional[Any] = ...) -> None: ...

0 commit comments

Comments
 (0)