Skip to content

Commit 55f6ac2

Browse files
committed
provenance: refactored out declare_file()
1 parent 78aeed2 commit 55f6ac2

File tree

1 file changed

+77
-68
lines changed

1 file changed

+77
-68
lines changed

cwltool/provenance.py

Lines changed: 77 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,79 @@ def start_process(self, process_name, process_run_id=None):
532532
None, None)
533533
return process_run_id
534534

535+
def declare_file(self, value):
    # type: (Dict) -> Tuple[ProvEntity,ProvEntity,str]
    """
    Declare a CWL ``class: File`` value in the provenance document.

    The data entity is resolved by trying, in order:

    1. ``value["checksum"]``, when its method is SHA1 and the research
       object already has a data file with that checksum;
    2. ``value["location"]``, whose content is opened and added to the
       research object as a data file;
    3. ``value["content"]``, which is declared through
       ``declare_artefact()``.

    A second, specialized entity identified by ``value["@id"]`` is then
    created to carry the basename/nameroot/nameext of this particular
    use of the file, and every entry of ``value["secondaryFiles"]`` is
    declared recursively as derived from it.

    Note that ``value`` is modified in place: ``"@id"`` is set if missing
    (to a fresh UUID URN), and ``"checksum"`` is set if missing and the
    file had to be added from its location.

    :param value: dictionary describing the file; must have
        ``"class": "File"``.
    :return: tuple ``(file_entity, entity, checksum)`` where
        ``file_entity`` is the specialized (named) entity, ``entity`` is
        the data entity and ``checksum`` is the hash string; ``checksum``
        is ``None`` when the entity was declared from ``"content"``.
    :raises ValueError: if ``value`` is not ``class: File``, or has none
        of checksum/location/content that could be resolved.
    """
    # .get() so that a missing "class" key is reported with the same
    # ValueError as a wrong class, and %r so the message actually shows
    # the offending value (the previous format string had no placeholder).
    if value.get("class") != "File":
        raise ValueError("Must have class:File: %r" % (value,))

    # Need to determine file hash aka RO filename
    entity = None
    checksum = None
    if 'checksum' in value:
        csum = value['checksum']
        (method, checksum) = csum.split("$", 1)
        if method == SHA1 and \
                self.research_object.has_data_file(checksum):
            entity = self.document.entity("data:" + checksum)

    if not entity and 'location' in value:
        location = str(value['location'])
        # If we made it here, we'll have to add it to the RO
        assert self.research_object.make_fs_access
        fsaccess = self.research_object.make_fs_access("")
        with fsaccess.open(location, "rb") as fhandle:
            relative_path = self.research_object.add_data_file(fhandle)
            # FIXME: This naively relies on add_data_file setting hash as filename
            checksum = posixpath.basename(relative_path)
            entity = self.document.entity(
                "data:" + checksum,
                {provM.PROV_TYPE: WFPROV["Artifact"]})
            if "checksum" not in value:
                value["checksum"] = "%s$%s" % (SHA1, checksum)

    if not entity and 'content' in value:
        # Anonymous file, add content as string
        entity = self.declare_artefact(value["content"])
        checksum = None  # TODO

    # By here one of them should have worked!
    if not entity:
        raise ValueError("class:File but missing checksum/location/content: %r" % value)

    # Track filename and extension, this is generally useful only for
    # secondaryFiles. Note that multiple uses of a file might thus record
    # different names for the same entity, so we'll
    # make/track a specialized entity by UUID
    file_id = value.setdefault("@id", uuid.uuid4().urn)
    # A specialized entity that has just these names
    file_entity = self.document.entity(
        file_id,
        [(provM.PROV_TYPE, WFPROV["Artifact"]),
         (provM.PROV_TYPE, WF4EVER["File"])])

    # Record whichever of the name-related fields are present
    for name_key in ("basename", "nameroot", "nameext"):
        if name_key in value:
            file_entity.add_attributes({CWLPROV[name_key]: value[name_key]})
    self.document.specializationOf(file_entity, entity)

    # Check for secondaries
    for sec in value.get("secondaryFiles", ()):
        # TODO: Record these in a specializationOf entity with UUID?
        sec_entity, _, _ = self.declare_file(sec)
        # We don't know how/when/where the secondary file was generated,
        # but CWL convention is a kind of summary/index derived
        # from the original file. As it is generally in a different
        # format, prov:Quotation is not appropriate.
        self.document.derivation(
            sec_entity, file_entity,
            other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]})
        # TODO: Add to self.secondaries so it can later
        # be augmented into primary-job.json

    return file_entity, entity, checksum
607+
535608
def declare_artefact(self, value):
536609
# type: (Any) -> ProvEntity
537610
'''
@@ -584,74 +657,9 @@ def declare_artefact(self, value):
584657

585658
# Base case - we found a File we need to update
586659
if value.get("class") == "File":
587-
# Need to determine file hash aka RO filename
588-
entity = None
589-
if 'checksum' in value:
590-
csum = value['checksum']
591-
(method, checksum) = csum.split("$", 1)
592-
if method == SHA1 and \
593-
self.research_object.has_data_file(checksum):
594-
entity = self.document.entity("data:" + checksum)
595-
596-
if not entity and 'location' in value:
597-
location = str(value['location'])
598-
# If we made it here, we'll have to add it to the RO
599-
assert self.research_object.make_fs_access
600-
fsaccess = self.research_object.make_fs_access("")
601-
with fsaccess.open(location, "rb") as fhandle:
602-
relative_path = self.research_object.add_data_file(fhandle)
603-
# FIXME: This naively relies on add_data_file setting hash as filename
604-
checksum = posixpath.basename(relative_path)
605-
entity = self.document.entity("data:" + checksum,
606-
{provM.PROV_TYPE: WFPROV["Artifact"]})
607-
if "checksum" not in value:
608-
value["checksum"] = "%s$%s" % (SHA1, checksum)
609-
610-
611-
if not entity and 'content' in value:
612-
# Anonymous file, add content as string
613-
entity = self.declare_artefact(value["content"])
614-
615-
# By here one of them should have worked!
616-
if not entity:
617-
raise ValueError("class:File but missing checksum/location/content: %r" % value)
618-
619-
620-
# Track filename and extension, this is generally useful only for
621-
# secondaryFiles. Note that multiple uses of a file might thus record
622-
# different names for the same entity, so we'll
623-
# make/track a specialized entity by UUID
624-
file_id = value.setdefault("@id", uuid.uuid4().urn)
625-
# A specialized entity that has just these names
626-
file_entity = self.document.entity(file_id,
627-
[(provM.PROV_TYPE, WFPROV["Artifact"]),
628-
(provM.PROV_TYPE, WF4EVER["File"])
629-
])
630-
631-
if "basename" in value:
632-
file_entity.add_attributes({CWLPROV["basename"]: value["basename"]})
633-
if "nameroot" in value:
634-
file_entity.add_attributes({CWLPROV["nameroot"]: value["nameroot"]})
635-
if "nameext" in value:
636-
file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]})
637-
self.document.specializationOf(file_entity, entity)
638-
639-
# Check for secondaries
640-
for sec in value.get("secondaryFiles", ()):
641-
# TODO: Record these in a specializationOf entity with UUID?
642-
sec_entity = self.declare_artefact(sec)
643-
# We don't know how/when/where the secondary file was generated,
644-
# but CWL convention is a kind of summary/index derived
645-
# from the original file. As its generally in a different format
646-
# then prov:Quotation is not appropriate.
647-
self.document.derivation(sec_entity, file_entity,
648-
other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]})
649-
# TODO: Add to self.secondaries so it can later
650-
# be augmented into primary-job.json
651-
652-
# Return the UUID file_entity so that we
653-
# know which filenames were used/generated in this activity
654-
return file_entity
660+
(entity,_,_) = self.declare_file(value)
661+
value["@id"] = entity.identifier.uri
662+
return entity
655663

656664
elif value.get("class") == "Directory":
657665
# Register any nested files/directories
@@ -1564,6 +1572,7 @@ def _relativise_files(self, structure):
15641572
'''
15651573
# Base case - we found a File we need to update
15661574
_logger.debug(u"[provenance] Relativising: %s", structure)
1575+
15671576
if isinstance(structure, dict):
15681577
if structure.get("class") == "File" and "location" in structure:
15691578
#standardised fs access object creation

0 commit comments

Comments
 (0)