Skip to content

Commit f6acdbb

Browse files
RenskeWmr-c
and authored
Refactoring provenance.py as provenance submodule (#1775)
Co-authored-by: Michael R. Crusoe <[email protected]>
1 parent 51d91cf commit f6acdbb

18 files changed

+557
-500
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ MODULE=cwltool
2424

2525
# `SHELL=bash` doesn't work for some, so don't use BASH-isms like
2626
# `[[` conditional expressions.
27-
PYSOURCES=$(wildcard ${MODULE}/**.py tests/*.py) setup.py
27+
PYSOURCES=$(wildcard ${MODULE}/**.py cwltool/cwlprov/*.py tests/*.py) setup.py
2828
DEVPKGS=diff_cover pylint pep257 pydocstyle 'tox<4' tox-pyenv \
2929
isort wheel autoflake pyupgrade bandit -rlint-requirements.txt\
3030
-rtest-requirements.txt -rmypy-requirements.txt -rdocs/requirements.txt
@@ -85,7 +85,7 @@ docs: FORCE
8585

8686
## clean : clean up all temporary / machine-generated files
8787
clean: check-python3 FORCE
88-
rm -f ${MODULE}/*.pyc tests/*.pyc *.so ${MODULE}/*.so
88+
rm -f ${MODULE}/*.pyc tests/*.pyc *.so ${MODULE}/*.so cwltool/cwlprov/*.so
8989
rm -Rf ${MODULE}/__pycache__/
9090
python setup.py clean --all || true
9191
rm -Rf .coverage

cwltool/builder.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@
5050
)
5151

5252
if TYPE_CHECKING:
53+
from .cwlprov.provenance_profile import (
54+
ProvenanceProfile, # pylint: disable=unused-import
55+
)
5356
from .pathmapper import PathMapper
54-
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import
5557

5658
INPUT_OBJ_VOCAB: Dict[str, str] = {
5759
"Any": "https://w3id.org/cwl/salad#Any",

cwltool/command_line_tool.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@
8787
)
8888

8989
if TYPE_CHECKING:
90-
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import
90+
from .cwlprov.provenance_profile import (
91+
ProvenanceProfile, # pylint: disable=unused-import
92+
)
9193

9294

9395
class PathCheckingMode(Enum):

cwltool/context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@
3434
from cwl_utils.parser.cwl_v1_2 import LoadingOptions
3535

3636
from .builder import Builder
37+
from .cwlprov.provenance_profile import ProvenanceProfile
38+
from .cwlprov.ro import ResearchObject
3739
from .mutation import MutationManager
3840
from .process import Process
39-
from .provenance import ResearchObject
40-
from .provenance_profile import ProvenanceProfile
4141
from .secrets import SecretStore
4242
from .software_requirements import DependenciesConfiguration
4343

cwltool/cwlprov/__init__.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""Stores Research Object including provenance."""
2+
3+
import hashlib
4+
import os
5+
import pwd
6+
import re
7+
import uuid
8+
from getpass import getuser
9+
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union
10+
11+
from typing_extensions import TypedDict
12+
13+
14+
def _whoami() -> Tuple[str, str]:
15+
"""Return the current operating system account as (username, fullname)."""
16+
username = getuser()
17+
try:
18+
fullname = pwd.getpwuid(os.getuid())[4].split(",")[0]
19+
except (KeyError, IndexError):
20+
fullname = username
21+
22+
return (username, fullname)
23+
24+
25+
def _check_mod_11_2(numeric_string: str) -> bool:
26+
"""
27+
Validate numeric_string for its MOD-11-2 checksum.
28+
29+
Any "-" in the numeric_string are ignored.
30+
31+
The last digit of numeric_string is assumed to be the checksum, 0-9 or X.
32+
33+
See ISO/IEC 7064:2003 and
34+
https://support.orcid.org/knowledgebase/articles/116780-structure-of-the-orcid-identifier
35+
"""
36+
# Strip -
37+
nums = numeric_string.replace("-", "")
38+
total = 0
39+
# skip last (check)digit
40+
for num in nums[:-1]:
41+
digit = int(num)
42+
total = (total + digit) * 2
43+
remainder = total % 11
44+
result = (12 - remainder) % 11
45+
if result == 10:
46+
checkdigit = "X"
47+
else:
48+
checkdigit = str(result)
49+
# Compare against last digit or X
50+
return nums[-1].upper() == checkdigit
51+
52+
53+
def _valid_orcid(orcid: Optional[str]) -> str:
54+
"""
55+
Ensure orcid is a valid ORCID identifier.
56+
57+
The string must be equivalent to one of these forms:
58+
59+
0000-0002-1825-0097
60+
orcid.org/0000-0002-1825-0097
61+
http://orcid.org/0000-0002-1825-0097
62+
https://orcid.org/0000-0002-1825-0097
63+
64+
If the ORCID number or prefix is invalid, a ValueError is raised.
65+
66+
The returned ORCID string is always in the form of:
67+
https://orcid.org/0000-0002-1825-0097
68+
"""
69+
if orcid is None or not orcid:
70+
raise ValueError("ORCID cannot be unspecified")
71+
# Liberal in what we consume, e.g. ORCID.org/0000-0002-1825-009x
72+
orcid = orcid.lower()
73+
match = re.match(
74+
# Note: concatenated r"" r"" below so we can add comments to pattern
75+
# Optional hostname, with or without protocol
76+
r"(http://orcid\.org/|https://orcid\.org/|orcid\.org/)?"
77+
# alternative pattern, but probably messier
78+
# r"^((https?://)?orcid.org/)?"
79+
# ORCID number is always 4x4 numerical digits,
80+
# but last digit (modulus 11 checksum)
81+
# can also be X (but we made it lowercase above).
82+
# e.g. 0000-0002-1825-0097
83+
# or 0000-0002-1694-233x
84+
r"(?P<orcid>(\d{4}-\d{4}-\d{4}-\d{3}[0-9x]))$",
85+
orcid,
86+
)
87+
88+
help_url = (
89+
"https://support.orcid.org/knowledgebase/articles/"
90+
"116780-structure-of-the-orcid-identifier"
91+
)
92+
if not match:
93+
raise ValueError(f"Invalid ORCID: {orcid}\n{help_url}")
94+
95+
# Conservative in what we produce:
96+
# a) Ensure any checksum digit is uppercase
97+
orcid_num = match.group("orcid").upper()
98+
# b) ..and correct
99+
if not _check_mod_11_2(orcid_num):
100+
raise ValueError(f"Invalid ORCID checksum: {orcid_num}\n{help_url}")
101+
102+
# c) Re-add the official prefix https://orcid.org/
103+
return "https://orcid.org/%s" % orcid_num
104+
105+
106+
# Typed shapes for research-object manifest entries.

Annotation = TypedDict(
    # Functional syntax is required here: "oa:motivatedBy" is not a valid
    # Python identifier, so the class-based form cannot express it.
    "Annotation",
    {
        "uri": str,
        "about": str,
        "content": Optional[Union[str, List[str]]],
        "oa:motivatedBy": Dict[str, str],
    },
)


class Aggregate(TypedDict, total=False):
    """One entry of the manifest "aggregates" list; all keys optional."""

    uri: Optional[str]
    # bundledAs is really an Aggregate, but cyclic TypedDict definitions
    # are not supported.
    bundledAs: Optional[Dict[str, Any]]
    mediatype: Optional[str]
    conformsTo: Optional[Union[str, List[str]]]
    createdOn: Optional[str]
    createdBy: Optional[Dict[str, str]]


class AuthoredBy(TypedDict, total=False):
    """Author attribution for the research object; all keys optional."""

    orcid: Optional[str]
    name: Optional[str]
    uri: Optional[str]
133+
134+
135+
def checksum_copy(
    src_file: IO[Any],
    dst_file: Optional[IO[Any]] = None,
    hasher: Optional[Callable[[], "hashlib._Hash"]] = None,
    buffersize: int = 1024 * 1024,
) -> str:
    """
    Compute a checksum of src_file while (optionally) copying it to dst_file.

    :param src_file: Open readable binary stream to checksum.
    :param dst_file: Optional open writable stream that receives a copy.
    :param hasher: Zero-argument factory returning a hashlib-style hash
        object; defaults to the module-wide Hasher from provenance_constants.
    :param buffersize: Chunk size (bytes) for reads.
    :return: Lowercase hexadecimal digest of src_file's contents.
    """
    # TODO: Use hashlib.new(Hasher_str) instead?
    if hasher:
        checksum = hasher()
    else:
        from .provenance_constants import Hasher

        checksum = Hasher()
    contents = src_file.read(buffersize)
    if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"):
        # Fast path: both streams are named files, so try to replace the
        # copy with a hard link to the source.  Move the destination aside
        # first so it can be restored if linking fails (e.g. cross-device).
        dst_name = dst_file.name  # cached: dst_file may be cleared below
        temp_location = os.path.join(os.path.dirname(dst_name), str(uuid.uuid4()))
        try:
            os.rename(dst_name, temp_location)
            os.link(src_file.name, dst_name)
            dst_file = None  # hard link made; skip writing the copy below
            os.unlink(temp_location)
        except OSError:
            pass
        if os.path.exists(temp_location):
            # Linking (or cleanup) failed: restore the original destination.
            # Bug fix: the previous code dereferenced dst_file.name here,
            # which raised AttributeError when dst_file had been set to None
            # (e.g. os.unlink failed after the link was created).
            os.rename(temp_location, dst_name)
    while contents != b"":
        if dst_file is not None:
            dst_file.write(contents)
        checksum.update(contents)
        contents = src_file.read(buffersize)
    if dst_file is not None:
        dst_file.flush()
    return checksum.hexdigest().lower()

cwltool/provenance_constants.py renamed to cwltool/cwlprov/provenance_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
# BagIt and YAML always use UTF-8
ENCODING = "UTF-8"
# NOTE: keep double quotes around the charset value.  This was historically
# built as 'text/plain; charset="%s"' % ENCODING; the refactored
# f"text/plain; charset={ENCODING!r}" emitted Python-repr single quotes
# ('UTF-8'), silently changing the media-type string.
TEXT_PLAIN = f'text/plain; charset="{ENCODING}"'
4242

4343
# sha1, compatible with the File type's "checksum" field
4444
# e.g. "checksum" = "sha1$47a013e660d408619d894b20806b1d5086aab03b"

cwltool/provenance_profile.py renamed to cwltool/cwlprov/provenance_profile.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
2525
from schema_salad.sourceline import SourceLine
2626

27-
from .errors import WorkflowException
28-
from .job import CommandLineJob, JobBase
29-
from .loghandler import _logger
30-
from .process import Process, shortname
27+
from ..errors import WorkflowException
28+
from ..job import CommandLineJob, JobBase
29+
from ..loghandler import _logger
30+
from ..process import Process, shortname
31+
from ..stdfsaccess import StdFsAccess
32+
from ..utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
33+
from ..workflow_job import WorkflowJob
3134
from .provenance_constants import (
3235
ACCOUNT_UUID,
3336
CWLPROV,
@@ -46,12 +49,10 @@
4649
WFDESC,
4750
WFPROV,
4851
)
49-
from .stdfsaccess import StdFsAccess
50-
from .utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
51-
from .workflow_job import WorkflowJob
52+
from .writablebagfile import create_job, write_bag_file # change this later
5253

5354
if TYPE_CHECKING:
54-
from .provenance import ResearchObject
55+
from .ro import ResearchObject
5556

5657

5758
def copy_job_order(job: Union[Process, JobsType], job_order_object: CWLObjectType) -> CWLObjectType:
@@ -114,10 +115,7 @@ def __init__(
114115

115116
def __str__(self) -> str:
116117
"""Represent this Provenvance profile as a string."""
117-
return "ProvenanceProfile <{}> in <{}>".format(
118-
self.workflow_run_uri,
119-
self.research_object,
120-
)
118+
return f"ProvenanceProfile <{self.workflow_run_uri}> in <{self.research_object}>"
121119

122120
def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
123121
"""Add basic namespaces."""
@@ -140,7 +138,7 @@ def host_provenance(document: ProvDocument) -> None:
140138
},
141139
)
142140

143-
self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
141+
self.cwltool_version = f"cwltool {versionstring().split()[-1]}"
144142
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
145143
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
146144
self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
@@ -240,7 +238,7 @@ def evaluate(
240238
self.prospective_prov(job)
241239
customised_job = copy_job_order(job, job_order_object)
242240
self.used_artefacts(customised_job, self.workflow_run_uri)
243-
research_obj.create_job(customised_job)
241+
create_job(research_obj, customised_job)
244242
elif hasattr(job, "workflow"):
245243
# record provenance of workflow executions
246244
self.prospective_prov(job)
@@ -460,7 +458,7 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity:
460458
ore_doc.add_bundle(dir_bundle)
461459
ore_doc = ore_doc.flattened()
462460
ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
463-
with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
461+
with write_bag_file(self.research_object, ore_doc_path) as provenance_file:
464462
ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
465463
self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri)
466464

@@ -477,7 +475,7 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
477475
data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN)
478476
checksum = PurePosixPath(data_file).name
479477
# FIXME: Don't naively assume add_data_file uses hash in filename!
480-
data_id = "data:%s" % PurePosixPath(data_file).stem
478+
data_id = f"data:{PurePosixPath(data_file).stem}"
481479
entity = self.document.entity(
482480
data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
483481
)
@@ -509,7 +507,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
509507
byte_s = BytesIO(value)
510508
data_file = self.research_object.add_data_file(byte_s)
511509
# FIXME: Don't naively assume add_data_file uses hash in filename!
512-
data_id = "data:%s" % PurePosixPath(data_file).stem
510+
data_id = f"data:{PurePosixPath(data_file).stem}"
513511
return self.document.entity(
514512
data_id,
515513
{PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
@@ -654,7 +652,7 @@ def generate_output_prov(
654652
# FIXME: Probably not "main" in nested workflows
655653
role = self.wf_ns[f"main/{name}/{output}"]
656654
else:
657-
role = self.wf_ns["main/%s" % output]
655+
role = self.wf_ns[f"main/{output}"]
658656

659657
if not process_run_id:
660658
process_run_id = self.workflow_run_uri
@@ -738,38 +736,38 @@ def finalize_prov_profile(self, name: Optional[str]) -> List[QualifiedName]:
738736
prov_ids = []
739737

740738
# https://www.w3.org/TR/prov-xml/
741-
with self.research_object.write_bag_file(basename + ".xml") as provenance_file:
739+
with write_bag_file(self.research_object, basename + ".xml") as provenance_file:
742740
self.document.serialize(provenance_file, format="xml", indent=4)
743741
prov_ids.append(self.provenance_ns[filename + ".xml"])
744742

745743
# https://www.w3.org/TR/prov-n/
746-
with self.research_object.write_bag_file(basename + ".provn") as provenance_file:
744+
with write_bag_file(self.research_object, basename + ".provn") as provenance_file:
747745
self.document.serialize(provenance_file, format="provn", indent=2)
748746
prov_ids.append(self.provenance_ns[filename + ".provn"])
749747

750748
# https://www.w3.org/Submission/prov-json/
751-
with self.research_object.write_bag_file(basename + ".json") as provenance_file:
749+
with write_bag_file(self.research_object, basename + ".json") as provenance_file:
752750
self.document.serialize(provenance_file, format="json", indent=2)
753751
prov_ids.append(self.provenance_ns[filename + ".json"])
754752

755753
# "rdf" aka https://www.w3.org/TR/prov-o/
756754
# which can be serialized to ttl/nt/jsonld (and more!)
757755

758756
# https://www.w3.org/TR/turtle/
759-
with self.research_object.write_bag_file(basename + ".ttl") as provenance_file:
757+
with write_bag_file(self.research_object, basename + ".ttl") as provenance_file:
760758
self.document.serialize(provenance_file, format="rdf", rdf_format="turtle")
761759
prov_ids.append(self.provenance_ns[filename + ".ttl"])
762760

763761
# https://www.w3.org/TR/n-triples/
764-
with self.research_object.write_bag_file(basename + ".nt") as provenance_file:
762+
with write_bag_file(self.research_object, basename + ".nt") as provenance_file:
765763
self.document.serialize(provenance_file, format="rdf", rdf_format="ntriples")
766764
prov_ids.append(self.provenance_ns[filename + ".nt"])
767765

768766
# https://www.w3.org/TR/json-ld/
769767
# TODO: Use a nice JSON-LD context
770768
# see also https://eprints.soton.ac.uk/395985/
771769
# 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
772-
with self.research_object.write_bag_file(basename + ".jsonld") as provenance_file:
770+
with write_bag_file(self.research_object, basename + ".jsonld") as provenance_file:
773771
self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
774772
prov_ids.append(self.provenance_ns[filename + ".jsonld"])
775773

0 commit comments

Comments
 (0)