@@ -532,6 +532,79 @@ def start_process(self, process_name, process_run_id=None):
532
532
None , None )
533
533
return process_run_id
534
534
535
def declare_file(self, value):
    # type: (Dict) -> Tuple[ProvEntity,ProvEntity,str]
    '''
    Declare a CWL ``class: File`` value in the provenance document.

    The data entity is resolved from the first of these that works:

    1. ``checksum`` - when it uses the SHA1 method and the research
       object already holds a data file with that hash.
    2. ``location`` - the file is opened and added to the research
       object; the hash is taken from the stored file's name.
    3. ``content`` - the literal content is declared as an artefact.

    A second, specialized entity identified by ``value["@id"]`` (a new
    UUID URN when absent) carries basename/nameroot/nameext and is
    linked to the data entity with ``specializationOf``. Entries of
    ``secondaryFiles`` are declared recursively and recorded as
    derivations of the specialized entity.

    Side effects: ``value`` is modified in place - ``"@id"`` is set if
    missing, and ``"checksum"`` is set if missing when the file was
    added through ``location``.

    :param value: dictionary describing the file.
    :return: tuple ``(file_entity, entity, checksum)`` where
        ``file_entity`` is the specialized (named) entity, ``entity``
        the data entity and ``checksum`` the hash string, or ``None``
        when the file was declared from ``content``.
    :raises ValueError: if ``value`` is not ``class: File``, or has no
        usable checksum/location/content.
    '''
    # .get() so that a missing "class" key is reported with the same
    # ValueError as a wrong class, instead of a bare KeyError.
    if value.get("class") != "File":
        raise ValueError("Must have class:File: %r" % (value,))

    # Need to determine file hash aka RO filename
    entity = None
    checksum = None
    if 'checksum' in value:
        csum = value['checksum']
        (method, checksum) = csum.split("$", 1)
        if method == SHA1 and \
                self.research_object.has_data_file(checksum):
            entity = self.document.entity("data:" + checksum)

    if not entity and 'location' in value:
        location = str(value['location'])
        # If we made it here, we'll have to add it to the RO
        assert self.research_object.make_fs_access
        fsaccess = self.research_object.make_fs_access("")
        with fsaccess.open(location, "rb") as fhandle:
            relative_path = self.research_object.add_data_file(fhandle)
            # FIXME: This naively relies on add_data_file setting hash as filename
            checksum = posixpath.basename(relative_path)
            entity = self.document.entity(
                "data:" + checksum,
                {provM.PROV_TYPE: WFPROV["Artifact"]})
            if "checksum" not in value:
                value["checksum"] = "%s$%s" % (SHA1, checksum)

    if not entity and 'content' in value:
        # Anonymous file, add content as string
        entity = self.declare_artefact(value["content"])
        checksum = None  # TODO

    # By here one of them should have worked!
    if not entity:
        raise ValueError(
            "class:File but missing checksum/location/content: %r" % value)

    # Track filename and extension, this is generally useful only for
    # secondaryFiles. Note that multiple uses of a file might thus record
    # different names for the same entity, so we'll
    # make/track a specialized entity by UUID
    file_id = value.setdefault("@id", uuid.uuid4().urn)
    # A specialized entity that has just these names
    file_entity = self.document.entity(
        file_id,
        [(provM.PROV_TYPE, WFPROV["Artifact"]),
         (provM.PROV_TYPE, WF4EVER["File"])])

    # Record only the name parts that are actually present
    for name_key in ("basename", "nameroot", "nameext"):
        if name_key in value:
            file_entity.add_attributes({CWLPROV[name_key]: value[name_key]})
    self.document.specializationOf(file_entity, entity)

    # Check for secondaries
    for sec in value.get("secondaryFiles", ()):
        # TODO: Record these in a specializationOf entity with UUID?
        if sec.get("class") == "File":
            (sec_entity, _, _) = self.declare_file(sec)
        else:
            # Anything that is not class:File would be rejected by
            # declare_file, so hand it to the generic declare_artefact.
            # NOTE(review): assumes declare_artefact returns an entity
            # for such values (e.g. class:Directory) - confirm.
            sec_entity = self.declare_artefact(sec)
        # We don't know how/when/where the secondary file was generated,
        # but by CWL convention it is a kind of summary/index derived
        # from the original file. As it is generally in a different
        # format, prov:Quotation is not appropriate.
        self.document.derivation(
            sec_entity, file_entity,
            other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]})
        # TODO: Add to self.secondaries so it can later
        # be augmented into primary-job.json

    return file_entity, entity, checksum
607
+
535
608
def declare_artefact (self , value ):
536
609
# type: (Any) -> ProvEntity
537
610
'''
@@ -584,74 +657,9 @@ def declare_artefact(self, value):
584
657
585
658
# Base case - we found a File we need to update
586
659
if value .get ("class" ) == "File" :
587
- # Need to determine file hash aka RO filename
588
- entity = None
589
- if 'checksum' in value :
590
- csum = value ['checksum' ]
591
- (method , checksum ) = csum .split ("$" , 1 )
592
- if method == SHA1 and \
593
- self .research_object .has_data_file (checksum ):
594
- entity = self .document .entity ("data:" + checksum )
595
-
596
- if not entity and 'location' in value :
597
- location = str (value ['location' ])
598
- # If we made it here, we'll have to add it to the RO
599
- assert self .research_object .make_fs_access
600
- fsaccess = self .research_object .make_fs_access ("" )
601
- with fsaccess .open (location , "rb" ) as fhandle :
602
- relative_path = self .research_object .add_data_file (fhandle )
603
- # FIXME: This naively relies on add_data_file setting hash as filename
604
- checksum = posixpath .basename (relative_path )
605
- entity = self .document .entity ("data:" + checksum ,
606
- {provM .PROV_TYPE : WFPROV ["Artifact" ]})
607
- if "checksum" not in value :
608
- value ["checksum" ] = "%s$%s" % (SHA1 , checksum )
609
-
610
-
611
- if not entity and 'content' in value :
612
- # Anonymous file, add content as string
613
- entity = self .declare_artefact (value ["content" ])
614
-
615
- # By here one of them should have worked!
616
- if not entity :
617
- raise ValueError ("class:File but missing checksum/location/content: %r" % value )
618
-
619
-
620
- # Track filename and extension, this is generally useful only for
621
- # secondaryFiles. Note that multiple uses of a file might thus record
622
- # different names for the same entity, so we'll
623
- # make/track a specialized entity by UUID
624
- file_id = value .setdefault ("@id" , uuid .uuid4 ().urn )
625
- # A specialized entity that has just these names
626
- file_entity = self .document .entity (file_id ,
627
- [(provM .PROV_TYPE , WFPROV ["Artifact" ]),
628
- (provM .PROV_TYPE , WF4EVER ["File" ])
629
- ])
630
-
631
- if "basename" in value :
632
- file_entity .add_attributes ({CWLPROV ["basename" ]: value ["basename" ]})
633
- if "nameroot" in value :
634
- file_entity .add_attributes ({CWLPROV ["nameroot" ]: value ["nameroot" ]})
635
- if "nameext" in value :
636
- file_entity .add_attributes ({CWLPROV ["nameext" ]: value ["nameext" ]})
637
- self .document .specializationOf (file_entity , entity )
638
-
639
- # Check for secondaries
640
- for sec in value .get ("secondaryFiles" , ()):
641
- # TODO: Record these in a specializationOf entity with UUID?
642
- sec_entity = self .declare_artefact (sec )
643
- # We don't know how/when/where the secondary file was generated,
644
- # but CWL convention is a kind of summary/index derived
645
- # from the original file. As its generally in a different format
646
- # then prov:Quotation is not appropriate.
647
- self .document .derivation (sec_entity , file_entity ,
648
- other_attributes = {PROV ["type" ]: CWLPROV ["SecondaryFile" ]})
649
- # TODO: Add to self.secondaries so it can later
650
- # be augmented into primary-job.json
651
-
652
- # Return the UUID file_entity so that we
653
- # know which filenames were used/generated in this activity
654
- return file_entity
660
+ (entity ,_ ,_ ) = self .declare_file (value )
661
+ value ["@id" ] = entity .identifier .uri
662
+ return entity
655
663
656
664
elif value .get ("class" ) == "Directory" :
657
665
# Register any nested files/directories
@@ -1564,6 +1572,7 @@ def _relativise_files(self, structure):
1564
1572
'''
1565
1573
# Base case - we found a File we need to update
1566
1574
_logger .debug (u"[provenance] Relativising: %s" , structure )
1575
+
1567
1576
if isinstance (structure , dict ):
1568
1577
if structure .get ("class" ) == "File" and "location" in structure :
1569
1578
#standardised fs access object creation
0 commit comments