Skip to content

Commit f6acdbb

Browse files
RenskeWmr-c
and authored
Refactoring provenance.py as provenance submodule (#1775)
Co-authored-by: Michael R. Crusoe <[email protected]>
1 parent 51d91cf commit f6acdbb

18 files changed

+557
-500
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ MODULE=cwltool
2424

2525
# `SHELL=bash` doesn't work for some, so don't use BASH-isms like
2626
# `[[` conditional expressions.
27-
PYSOURCES=$(wildcard ${MODULE}/**.py tests/*.py) setup.py
27+
PYSOURCES=$(wildcard ${MODULE}/**.py cwltool/cwlprov/*.py tests/*.py) setup.py
2828
DEVPKGS=diff_cover pylint pep257 pydocstyle 'tox<4' tox-pyenv \
2929
isort wheel autoflake pyupgrade bandit -rlint-requirements.txt\
3030
-rtest-requirements.txt -rmypy-requirements.txt -rdocs/requirements.txt
@@ -85,7 +85,7 @@ docs: FORCE
8585

8686
## clean : clean up all temporary / machine-generated files
8787
clean: check-python3 FORCE
88-
rm -f ${MODULE}/*.pyc tests/*.pyc *.so ${MODULE}/*.so
88+
rm -f ${MODULE}/*.pyc tests/*.pyc *.so ${MODULE}/*.so cwltool/cwlprov/*.so
8989
rm -Rf ${MODULE}/__pycache__/
9090
python setup.py clean --all || true
9191
rm -Rf .coverage

cwltool/builder.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@
5050
)
5151

5252
if TYPE_CHECKING:
53+
from .cwlprov.provenance_profile import (
54+
ProvenanceProfile, # pylint: disable=unused-import
55+
)
5356
from .pathmapper import PathMapper
54-
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import
5557

5658
INPUT_OBJ_VOCAB: Dict[str, str] = {
5759
"Any": "https://w3id.org/cwl/salad#Any",

cwltool/command_line_tool.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@
8787
)
8888

8989
if TYPE_CHECKING:
90-
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import
90+
from .cwlprov.provenance_profile import (
91+
ProvenanceProfile, # pylint: disable=unused-import
92+
)
9193

9294

9395
class PathCheckingMode(Enum):

cwltool/context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@
3434
from cwl_utils.parser.cwl_v1_2 import LoadingOptions
3535

3636
from .builder import Builder
37+
from .cwlprov.provenance_profile import ProvenanceProfile
38+
from .cwlprov.ro import ResearchObject
3739
from .mutation import MutationManager
3840
from .process import Process
39-
from .provenance import ResearchObject
40-
from .provenance_profile import ProvenanceProfile
4141
from .secrets import SecretStore
4242
from .software_requirements import DependenciesConfiguration
4343

cwltool/cwlprov/__init__.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""Stores Research Object including provenance."""
2+
3+
import hashlib
4+
import os
5+
import pwd
6+
import re
7+
import uuid
8+
from getpass import getuser
9+
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union
10+
11+
from typing_extensions import TypedDict
12+
13+
14+
def _whoami() -> Tuple[str, str]:
15+
"""Return the current operating system account as (username, fullname)."""
16+
username = getuser()
17+
try:
18+
fullname = pwd.getpwuid(os.getuid())[4].split(",")[0]
19+
except (KeyError, IndexError):
20+
fullname = username
21+
22+
return (username, fullname)
23+
24+
25+
def _check_mod_11_2(numeric_string: str) -> bool:
26+
"""
27+
Validate numeric_string for its MOD-11-2 checksum.
28+
29+
Any "-" in the numeric_string are ignored.
30+
31+
The last digit of numeric_string is assumed to be the checksum, 0-9 or X.
32+
33+
See ISO/IEC 7064:2003 and
34+
https://support.orcid.org/knowledgebase/articles/116780-structure-of-the-orcid-identifier
35+
"""
36+
# Strip -
37+
nums = numeric_string.replace("-", "")
38+
total = 0
39+
# skip last (check)digit
40+
for num in nums[:-1]:
41+
digit = int(num)
42+
total = (total + digit) * 2
43+
remainder = total % 11
44+
result = (12 - remainder) % 11
45+
if result == 10:
46+
checkdigit = "X"
47+
else:
48+
checkdigit = str(result)
49+
# Compare against last digit or X
50+
return nums[-1].upper() == checkdigit
51+
52+
53+
def _valid_orcid(orcid: Optional[str]) -> str:
54+
"""
55+
Ensure orcid is a valid ORCID identifier.
56+
57+
The string must be equivalent to one of these forms:
58+
59+
0000-0002-1825-0097
60+
orcid.org/0000-0002-1825-0097
61+
http://orcid.org/0000-0002-1825-0097
62+
https://orcid.org/0000-0002-1825-0097
63+
64+
If the ORCID number or prefix is invalid, a ValueError is raised.
65+
66+
The returned ORCID string is always in the form of:
67+
https://orcid.org/0000-0002-1825-0097
68+
"""
69+
if orcid is None or not orcid:
70+
raise ValueError("ORCID cannot be unspecified")
71+
# Liberal in what we consume, e.g. ORCID.org/0000-0002-1825-009x
72+
orcid = orcid.lower()
73+
match = re.match(
74+
# Note: concatenated r"" r"" below so we can add comments to pattern
75+
# Optional hostname, with or without protocol
76+
r"(http://orcid\.org/|https://orcid\.org/|orcid\.org/)?"
77+
# alternative pattern, but probably messier
78+
# r"^((https?://)?orcid.org/)?"
79+
# ORCID number is always 4x4 numerical digits,
80+
# but last digit (modulus 11 checksum)
81+
# can also be X (but we made it lowercase above).
82+
# e.g. 0000-0002-1825-0097
83+
# or 0000-0002-1694-233x
84+
r"(?P<orcid>(\d{4}-\d{4}-\d{4}-\d{3}[0-9x]))$",
85+
orcid,
86+
)
87+
88+
help_url = (
89+
"https://support.orcid.org/knowledgebase/articles/"
90+
"116780-structure-of-the-orcid-identifier"
91+
)
92+
if not match:
93+
raise ValueError(f"Invalid ORCID: {orcid}\n{help_url}")
94+
95+
# Conservative in what we produce:
96+
# a) Ensure any checksum digit is uppercase
97+
orcid_num = match.group("orcid").upper()
98+
# b) ..and correct
99+
if not _check_mod_11_2(orcid_num):
100+
raise ValueError(f"Invalid ORCID checksum: {orcid_num}\n{help_url}")
101+
102+
# c) Re-add the official prefix https://orcid.org/
103+
return "https://orcid.org/%s" % orcid_num
104+
105+
106+
# Typed shapes for research-object manifest entries.

Annotation = TypedDict(
    # Functional syntax is required here: "oa:motivatedBy" is not a valid
    # Python identifier, so the class-based form cannot express it.
    "Annotation",
    {
        "uri": str,
        "about": str,
        "content": Optional[Union[str, List[str]]],
        "oa:motivatedBy": Dict[str, str],
    },
)


class Aggregate(TypedDict, total=False):
    """One entry of the manifest "aggregates" list; all keys optional."""

    uri: Optional[str]
    # bundledAs is really an Aggregate, but cyclic TypedDict definitions
    # are not supported.
    bundledAs: Optional[Dict[str, Any]]
    mediatype: Optional[str]
    conformsTo: Optional[Union[str, List[str]]]
    createdOn: Optional[str]
    createdBy: Optional[Dict[str, str]]


class AuthoredBy(TypedDict, total=False):
    """Author attribution for the research object; all keys optional."""

    orcid: Optional[str]
    name: Optional[str]
    uri: Optional[str]
133+
134+
135+
def checksum_copy(
    src_file: IO[Any],
    dst_file: Optional[IO[Any]] = None,
    hasher: Optional[Callable[[], "hashlib._Hash"]] = None,
    buffersize: int = 1024 * 1024,
) -> str:
    """
    Compute a checksum of src_file while (optionally) copying it to dst_file.

    :param src_file: Open readable binary stream to checksum.
    :param dst_file: Optional open writable stream that receives a copy.
    :param hasher: Zero-argument factory returning a hashlib-style hash
        object; defaults to the module-wide Hasher from provenance_constants.
    :param buffersize: Chunk size (bytes) for reads.
    :return: Lowercase hexadecimal digest of src_file's contents.
    """
    # TODO: Use hashlib.new(Hasher_str) instead?
    if hasher:
        checksum = hasher()
    else:
        from .provenance_constants import Hasher

        checksum = Hasher()
    contents = src_file.read(buffersize)
    if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"):
        # Fast path: both streams are named files, so try to replace the
        # copy with a hard link to the source.  Move the destination aside
        # first so it can be restored if linking fails (e.g. cross-device).
        dst_name = dst_file.name  # cached: dst_file may be cleared below
        temp_location = os.path.join(os.path.dirname(dst_name), str(uuid.uuid4()))
        try:
            os.rename(dst_name, temp_location)
            os.link(src_file.name, dst_name)
            dst_file = None  # hard link made; skip writing the copy below
            os.unlink(temp_location)
        except OSError:
            pass
        if os.path.exists(temp_location):
            # Linking (or cleanup) failed: restore the original destination.
            # Bug fix: the previous code dereferenced dst_file.name here,
            # which raised AttributeError when dst_file had been set to None
            # (e.g. os.unlink failed after the link was created).
            os.rename(temp_location, dst_name)
    while contents != b"":
        if dst_file is not None:
            dst_file.write(contents)
        checksum.update(contents)
        contents = src_file.read(buffersize)
    if dst_file is not None:
        dst_file.flush()
    return checksum.hexdigest().lower()

cwltool/provenance_constants.py renamed to cwltool/cwlprov/provenance_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
# BagIt and YAML always use UTF-8
ENCODING = "UTF-8"
# NOTE: keep double quotes around the charset value.  This was historically
# built as 'text/plain; charset="%s"' % ENCODING; the refactored
# f"text/plain; charset={ENCODING!r}" emitted Python-repr single quotes
# ('UTF-8'), silently changing the media-type string.
TEXT_PLAIN = f'text/plain; charset="{ENCODING}"'
4242

4343
# sha1, compatible with the File type's "checksum" field
4444
# e.g. "checksum" = "sha1$47a013e660d408619d894b20806b1d5086aab03b"

cwltool/provenance_profile.py renamed to cwltool/cwlprov/provenance_profile.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
2525
from schema_salad.sourceline import SourceLine
2626

27-
from .errors import WorkflowException
28-
from .job import CommandLineJob, JobBase
29-
from .loghandler import _logger
30-
from .process import Process, shortname
27+
from ..errors import WorkflowException
28+
from ..job import CommandLineJob, JobBase
29+
from ..loghandler import _logger
30+
from ..process import Process, shortname
31+
from ..stdfsaccess import StdFsAccess
32+
from ..utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
33+
from ..workflow_job import WorkflowJob
3134
from .provenance_constants import (
3235
ACCOUNT_UUID,
3336
CWLPROV,
@@ -46,12 +49,10 @@
4649
WFDESC,
4750
WFPROV,
4851
)
49-
from .stdfsaccess import StdFsAccess
50-
from .utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
51-
from .workflow_job import WorkflowJob
52+
from .writablebagfile import create_job, write_bag_file # change this later
5253

5354
if TYPE_CHECKING:
54-
from .provenance import ResearchObject
55+
from .ro import ResearchObject
5556

5657

5758
def copy_job_order(job: Union[Process, JobsType], job_order_object: CWLObjectType) -> CWLObjectType:
@@ -114,10 +115,7 @@ def __init__(
114115

115116
def __str__(self) -> str:
116117
"""Represent this Provenvance profile as a string."""
117-
return "ProvenanceProfile <{}> in <{}>".format(
118-
self.workflow_run_uri,
119-
self.research_object,
120-
)
118+
return f"ProvenanceProfile <{self.workflow_run_uri}> in <{self.research_object}>"
121119

122120
def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
123121
"""Add basic namespaces."""
@@ -140,7 +138,7 @@ def host_provenance(document: ProvDocument) -> None:
140138
},
141139
)
142140

143-
self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
141+
self.cwltool_version = f"cwltool {versionstring().split()[-1]}"
144142
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
145143
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
146144
self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
@@ -240,7 +238,7 @@ def evaluate(
240238
self.prospective_prov(job)
241239
customised_job = copy_job_order(job, job_order_object)
242240
self.used_artefacts(customised_job, self.workflow_run_uri)
243-
research_obj.create_job(customised_job)
241+
create_job(research_obj, customised_job)
244242
elif hasattr(job, "workflow"):
245243
# record provenance of workflow executions
246244
self.prospective_prov(job)
@@ -460,7 +458,7 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity:
460458
ore_doc.add_bundle(dir_bundle)
461459
ore_doc = ore_doc.flattened()
462460
ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
463-
with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
461+
with write_bag_file(self.research_object, ore_doc_path) as provenance_file:
464462
ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
465463
self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri)
466464

@@ -477,7 +475,7 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
477475
data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN)
478476
checksum = PurePosixPath(data_file).name
479477
# FIXME: Don't naively assume add_data_file uses hash in filename!
480-
data_id = "data:%s" % PurePosixPath(data_file).stem
478+
data_id = f"data:{PurePosixPath(data_file).stem}"
481479
entity = self.document.entity(
482480
data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
483481
)
@@ -509,7 +507,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
509507
byte_s = BytesIO(value)
510508
data_file = self.research_object.add_data_file(byte_s)
511509
# FIXME: Don't naively assume add_data_file uses hash in filename!
512-
data_id = "data:%s" % PurePosixPath(data_file).stem
510+
data_id = f"data:{PurePosixPath(data_file).stem}"
513511
return self.document.entity(
514512
data_id,
515513
{PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
@@ -654,7 +652,7 @@ def generate_output_prov(
654652
# FIXME: Probably not "main" in nested workflows
655653
role = self.wf_ns[f"main/{name}/{output}"]
656654
else:
657-
role = self.wf_ns["main/%s" % output]
655+
role = self.wf_ns[f"main/{output}"]
658656

659657
if not process_run_id:
660658
process_run_id = self.workflow_run_uri
@@ -738,38 +736,38 @@ def finalize_prov_profile(self, name: Optional[str]) -> List[QualifiedName]:
738736
prov_ids = []
739737

740738
# https://www.w3.org/TR/prov-xml/
741-
with self.research_object.write_bag_file(basename + ".xml") as provenance_file:
739+
with write_bag_file(self.research_object, basename + ".xml") as provenance_file:
742740
self.document.serialize(provenance_file, format="xml", indent=4)
743741
prov_ids.append(self.provenance_ns[filename + ".xml"])
744742

745743
# https://www.w3.org/TR/prov-n/
746-
with self.research_object.write_bag_file(basename + ".provn") as provenance_file:
744+
with write_bag_file(self.research_object, basename + ".provn") as provenance_file:
747745
self.document.serialize(provenance_file, format="provn", indent=2)
748746
prov_ids.append(self.provenance_ns[filename + ".provn"])
749747

750748
# https://www.w3.org/Submission/prov-json/
751-
with self.research_object.write_bag_file(basename + ".json") as provenance_file:
749+
with write_bag_file(self.research_object, basename + ".json") as provenance_file:
752750
self.document.serialize(provenance_file, format="json", indent=2)
753751
prov_ids.append(self.provenance_ns[filename + ".json"])
754752

755753
# "rdf" aka https://www.w3.org/TR/prov-o/
756754
# which can be serialized to ttl/nt/jsonld (and more!)
757755

758756
# https://www.w3.org/TR/turtle/
759-
with self.research_object.write_bag_file(basename + ".ttl") as provenance_file:
757+
with write_bag_file(self.research_object, basename + ".ttl") as provenance_file:
760758
self.document.serialize(provenance_file, format="rdf", rdf_format="turtle")
761759
prov_ids.append(self.provenance_ns[filename + ".ttl"])
762760

763761
# https://www.w3.org/TR/n-triples/
764-
with self.research_object.write_bag_file(basename + ".nt") as provenance_file:
762+
with write_bag_file(self.research_object, basename + ".nt") as provenance_file:
765763
self.document.serialize(provenance_file, format="rdf", rdf_format="ntriples")
766764
prov_ids.append(self.provenance_ns[filename + ".nt"])
767765

768766
# https://www.w3.org/TR/json-ld/
769767
# TODO: Use a nice JSON-LD context
770768
# see also https://eprints.soton.ac.uk/395985/
771769
# 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
772-
with self.research_object.write_bag_file(basename + ".jsonld") as provenance_file:
770+
with write_bag_file(self.research_object, basename + ".jsonld") as provenance_file:
773771
self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
774772
prov_ids.append(self.provenance_ns[filename + ".jsonld"])
775773

0 commit comments

Comments
 (0)