diff --git a/.gitignore b/.gitignore index fbe4b24fc..b4cab0e66 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ eggs/ *.egg .tox/ .pytest_cache +*.so # Editor Temps .*.sw? diff --git a/cwltool/context.py b/cwltool/context.py index 237a90968..bb281fd88 100644 --- a/cwltool/context.py +++ b/cwltool/context.py @@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None: self.orcid: str = "" self.cwl_full_name: str = "" self.process_run_id: Optional[str] = None + self.prov_host: bool = False + self.prov_user: bool = False self.prov_obj: Optional[ProvenanceProfile] = None self.mpi_config: MpiConfig = MpiConfig() self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py index d4dfd6cb4..e8538e51b 100644 --- a/cwltool/cwlprov/provenance_profile.py +++ b/cwltool/cwlprov/provenance_profile.py @@ -6,7 +6,6 @@ from collections.abc import MutableMapping, MutableSequence, Sequence from io import BytesIO from pathlib import PurePath, PurePosixPath -from socket import getfqdn from typing import TYPE_CHECKING, Any, Optional, Union, cast from prov.identifier import Identifier, QualifiedName @@ -24,12 +23,10 @@ ACCOUNT_UUID, CWLPROV, ENCODING, - FOAF, METADATA, ORE, PROVENANCE, RO, - SCHEMA, SHA1, SHA256, TEXT_PLAIN, @@ -108,25 +105,6 @@ def __str__(self) -> str: def generate_prov_doc(self) -> tuple[str, ProvDocument]: """Add basic namespaces.""" - - def host_provenance(document: ProvDocument) -> None: - """Record host provenance.""" - document.add_namespace(CWLPROV) - document.add_namespace(UUID) - document.add_namespace(FOAF) - - hostname = getfqdn() - # won't have a foaf:accountServiceHomepage for unix hosts, but - # we can at least provide hostname - document.agent( - ACCOUNT_UUID, - { - PROV_TYPE: FOAF["OnlineAccount"], - "prov:location": hostname, - CWLPROV["hostname"]: hostname, - }, - ) - self.cwltool_version = f"cwltool {versionstring().split()[-1]}" self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#") # document.add_namespace('prov', 'http://www.w3.org/ns/prov#') @@ -165,25 +143,10 @@ def host_provenance(document: ProvDocument) -> None: # .. but we always know cwltool was launched (directly or indirectly) # by a user account, as cwltool is a command line tool account = self.document.agent(ACCOUNT_UUID) - if self.orcid or self.full_name: - person: dict[Union[str, Identifier], Any] = { - PROV_TYPE: PROV["Person"], - "prov:type": SCHEMA["Person"], - } - if self.full_name: - person["prov:label"] = self.full_name - person["foaf:name"] = self.full_name - person["schema:name"] = self.full_name - else: - # TODO: Look up name from ORCID API? - pass - agent = self.document.agent(self.orcid or uuid.uuid4().urn, person) - self.document.actedOnBehalfOf(account, agent) - else: - if self.host_provenance: - host_provenance(self.document) - if self.user_provenance: - self.research_object.user_provenance(self.document) + if self.host_provenance: + self.research_object.host_provenance(self.document) + if self.user_provenance or self.orcid or self.full_name: + self.research_object.user_provenance(self.document) # The execution of cwltool wfengine = self.document.agent( self.engine_uuid, diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py index ac60afc92..f58919a6b 100644 --- a/cwltool/cwlprov/ro.py +++ b/cwltool/cwlprov/ro.py @@ -9,10 +9,11 @@ import uuid from collections.abc import MutableMapping, MutableSequence from pathlib import Path, PurePosixPath -from typing import IO, Any, Optional, Union, cast +from socket import getfqdn +from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast import prov.model as provM -from prov.model import PROV, ProvDocument +from prov.model import ProvDocument from ..loghandler import _logger from ..stdfsaccess import StdFsAccess @@ -27,6 +28,7 @@ from . import Aggregate, Annotation, AuthoredBy, _valid_orcid, _whoami, checksum_copy from .provenance_constants import ( ACCOUNT_UUID, + CWLPROV, CWLPROV_VERSION, DATA, ENCODING, @@ -35,6 +37,7 @@ METADATA, ORCID, PROVENANCE, + SCHEMA, SHA1, SHA256, SHA512, @@ -46,6 +49,9 @@ Hasher, ) +if TYPE_CHECKING: + from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import + class ResearchObject: """CWLProv Research Object.""" @@ -82,6 +88,34 @@ def __init__( self._initialize() _logger.debug("[provenance] Temporary research object: %s", self.folder) + def initialize_provenance( + self, + full_name: str, + host_provenance: bool, + user_provenance: bool, + orcid: str, + fsaccess: StdFsAccess, + run_uuid: Optional[uuid.UUID] = None, + ) -> "ProvenanceProfile": + """ + Provide a provenance profile initialization hook function. + + Allows overriding the default strategy to define the + provenance profile concepts and associations to extend + details as needed. + """ + from .provenance_profile import ProvenanceProfile + + return ProvenanceProfile( + research_object=self, + full_name=full_name, + host_provenance=host_provenance, + user_provenance=user_provenance, + orcid=orcid, + fsaccess=fsaccess, + run_uuid=run_uuid, + ) + def self_check(self) -> None: """Raise ValueError if this RO is closed.""" if self.closed: @@ -117,10 +151,22 @@ def _initialize_bagit(self) -> None: bag_it_file.write("BagIt-Version: 0.97\n") bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n") + def resolve_user(self) -> tuple[str, str]: + """ + Provide a user provenance hook function. + + Allows overriding the default strategy to retrieve user provenance + in case the calling code can provide a better resolution. + The function must return a tuple of the (username, fullname) + that identifies the user. This user will be applied on top + to any provided ORCID or fullname by agent association. + """ + return _whoami() + def user_provenance(self, document: ProvDocument) -> None: """Add the user provenance.""" self.self_check() - (username, fullname) = _whoami() + (username, fullname) = self.resolve_user() if not self.full_name: self.full_name = fullname @@ -132,19 +178,21 @@ def user_provenance(self, document: ProvDocument) -> None: ACCOUNT_UUID, { provM.PROV_TYPE: FOAF["OnlineAccount"], - "prov:label": username, + provM.PROV_LABEL: username, FOAF["accountName"]: username, }, ) user = document.agent( self.orcid or USER_UUID, - { - provM.PROV_TYPE: PROV["Person"], - "prov:label": self.full_name, - FOAF["name"]: self.full_name, - FOAF["account"]: account, - }, + [ + (provM.PROV_TYPE, SCHEMA["Person"]), + (provM.PROV_TYPE, provM.PROV["Person"]), + (provM.PROV_LABEL, self.full_name), + (FOAF["name"], self.full_name), + (FOAF["account"], account), + (SCHEMA["name"], self.full_name), + ], ) # cwltool may be started on the shell (directly by user), # by shell script (indirectly by user) @@ -156,6 +204,35 @@ def user_provenance(self, document: ProvDocument) -> None: # get their name wrong!) document.actedOnBehalfOf(account, user) + def resolve_host(self) -> tuple[str, str]: + """ + Provide a host provenance hook function. + + Allows overriding the default strategy to retrieve host provenance + in case the calling code can provide a better resolution. + The function must return a tuple of the (fqdn, uri) that identifies the host. + """ + fqdn = getfqdn() + return fqdn, fqdn # allow for (fqdn, uri) to be distinct, but the same by default + + def host_provenance(self, document: ProvDocument) -> None: + """Record host provenance.""" + document.add_namespace(CWLPROV) + document.add_namespace(UUID) + document.add_namespace(FOAF) + + hostname, uri = self.resolve_host() + # won't have a foaf:accountServiceHomepage for unix hosts, but + # we can at least provide hostname + document.agent( + ACCOUNT_UUID, + { + provM.PROV_TYPE: FOAF["OnlineAccount"], + provM.PROV_LOCATION: uri, + CWLPROV["hostname"]: hostname, + }, + ) + def add_tagfile(self, path: str, timestamp: Optional[datetime.datetime] = None) -> None: """Add tag files to our research object.""" self.self_check() diff --git a/cwltool/executors.py b/cwltool/executors.py index e25426c9d..33198d854 100644 --- a/cwltool/executors.py +++ b/cwltool/executors.py @@ -19,7 +19,6 @@ from .command_line_tool import CallbackJob, ExpressionJob from .context import RuntimeContext, getdefault from .cuda import cuda_version_and_device_count -from .cwlprov.provenance_profile import ProvenanceProfile from .errors import WorkflowException from .job import JobBase from .loghandler import _logger @@ -194,11 +193,13 @@ def run_jobs( # define provenance profile for single commandline tool if not isinstance(process, Workflow) and runtime_context.research_obj is not None: - process.provenance_object = ProvenanceProfile( - runtime_context.research_obj, + process.provenance_object = runtime_context.research_obj.initialize_provenance( full_name=runtime_context.cwl_full_name, - host_provenance=False, - user_provenance=False, + # following are only set from main when directly command line tool + # when nested in a workflow, they should be disabled since they would + # already have been provided/initialized by the parent workflow prov-obj + host_provenance=runtime_context.prov_host, + user_provenance=runtime_context.prov_user, orcid=runtime_context.orcid, # single tool execution, so RO UUID = wf UUID = tool UUID run_uuid=runtime_context.research_obj.ro_uuid, diff --git a/cwltool/main.py b/cwltool/main.py index b7ba40d40..a137d8a4f 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -1065,6 +1065,11 @@ def main( loadingContext = setup_loadingContext(loadingContext, runtimeContext, args) + if loadingContext.research_obj: + # early forward parameters required for a single command line tool + runtimeContext.prov_host = loadingContext.host_provenance + runtimeContext.prov_user = loadingContext.user_provenance + uri, tool_file_uri = resolve_tool_uri( args.workflow, resolver=loadingContext.resolver, diff --git a/cwltool/workflow.py b/cwltool/workflow.py index 3bf32251f..899ac4643 100644 --- a/cwltool/workflow.py +++ b/cwltool/workflow.py @@ -72,8 +72,7 @@ def __init__( if is_main: run_uuid = loadingContext.research_obj.ro_uuid - self.provenance_object = ProvenanceProfile( - loadingContext.research_obj, + self.provenance_object = loadingContext.research_obj.initialize_provenance( full_name=loadingContext.cwl_full_name, host_provenance=loadingContext.host_provenance, user_provenance=loadingContext.user_provenance, diff --git a/mypy-stubs/rdflib/graph.pyi b/mypy-stubs/rdflib/graph.pyi index d3e6f2f54..9764972b2 100644 --- a/mypy-stubs/rdflib/graph.pyi +++ b/mypy-stubs/rdflib/graph.pyi @@ -16,7 +16,7 @@ from rdflib import query from rdflib.collection import Collection from rdflib.paths import Path from rdflib.resource import Resource -from rdflib.term import BNode, Identifier, Node +from rdflib.term import BNode, Identifier, Literal, Node class Graph(Node): base: Any = ... @@ -66,7 +66,7 @@ class Graph(Node): ) -> Iterable[Node]: ... def objects( self, subject: Optional[Any] = ..., predicate: Optional[Any] = ... - ) -> Iterable[Identifier]: ... + ) -> Iterable[Union[Identifier, Literal]]: ... def subject_predicates(self, object: Optional[Any] = ...) -> None: ... def subject_objects(self, predicate: Optional[Any] = ...) -> None: ... def predicate_objects(self, subject: Optional[Any] = ...) -> None: ... diff --git a/tests/test_provenance.py b/tests/test_provenance.py index e8d8416be..d7a2a698b 100644 --- a/tests/test_provenance.py +++ b/tests/test_provenance.py @@ -32,12 +32,23 @@ SCHEMA = Namespace("http://schema.org/") CWLPROV = Namespace("https://w3id.org/cwl/prov#") OA = Namespace("http://www.w3.org/ns/oa#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") -def cwltool(tmp_path: Path, *args: Any) -> Path: +TEST_ORCID = "https://orcid.org/0000-0003-4862-3349" + + +def cwltool(tmp_path: Path, *args: Any, with_orcid: bool = False) -> Path: prov_folder = tmp_path / "provenance" prov_folder.mkdir() - new_args = ["--provenance", str(prov_folder)] + new_args = [ + "--enable-user-provenance", + "--enable-host-provenance", + "--provenance", + str(prov_folder), + ] + if with_orcid: + new_args.extend(["--orcid", TEST_ORCID]) new_args.extend(args) # Run within a temporary directory to not pollute git checkout tmp_dir = tmp_path / "cwltool-run" @@ -49,61 +60,81 @@ def cwltool(tmp_path: Path, *args: Any) -> Path: @needs_docker -def test_hello_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_workflow(tmp_path: Path, with_orcid: bool) -> None: check_provenance( cwltool( tmp_path, get_data("tests/wf/hello-workflow.cwl"), "--usermessage", "Hello workflow", - ) + with_orcid=with_orcid, + ), + with_orcid=with_orcid, ) @needs_docker -def test_hello_single_tool(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_hello_single_tool(tmp_path: Path, with_orcid: bool) -> None: check_provenance( cwltool( tmp_path, get_data("tests/wf/hello_single_tool.cwl"), "--message", "Hello tool", + with_orcid=with_orcid, ), single_tool=True, + with_orcid=with_orcid, ) @needs_docker -def test_revsort_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/revsort.cwl"), get_data("tests/wf/revsort-job.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) @needs_docker -def test_revsort_workflow_shortcut(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_revsort_workflow_shortcut(tmp_path: Path, with_orcid: bool) -> None: """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files.""" folder = cwltool( tmp_path, get_data("tests/wf/revsort-job-shortcut.json"), + with_orcid=with_orcid, ) check_output_object(folder) - check_provenance(folder) + check_provenance(folder, with_orcid=with_orcid) assert not (folder / "snapshot" / "revsort-job-shortcut.json").exists() assert len(list((folder / "snapshot").iterdir())) == 4 @needs_docker -def test_nested_workflow(tmp_path: Path) -> None: - check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True) +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_nested_workflow(tmp_path: Path, with_orcid: bool) -> None: + check_provenance( + cwltool( + tmp_path, + get_data("tests/wf/nested.cwl"), + with_orcid=with_orcid, + ), + nested=True, + with_orcid=with_orcid, + ) @needs_docker -def test_secondary_files_implicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_implicit(tmp_path: Path, with_orcid: bool) -> None: file1 = tmp_path / "foo1.txt" file1idx = tmp_path / "foo1.txt.idx" @@ -113,13 +144,20 @@ def test_secondary_files_implicit(tmp_path: Path) -> None: f.write("bar") # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), "--file1", str(file1)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + "--file1", + str(file1), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_explicit(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_explicit(tmp_path: Path, with_orcid: bool) -> None: # Deliberately do NOT have common basename or extension file1dir = tmp_path / "foo" file1dir.mkdir() @@ -154,22 +192,33 @@ def test_secondary_files_explicit(tmp_path: Path) -> None: j = json.dumps(job, ensure_ascii=True) fp.write(j.encode("ascii")) - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), str(jobJson)) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf.cwl"), + str(jobJson), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) check_secondary_files(folder) @needs_docker -def test_secondary_files_output(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_secondary_files_output(tmp_path: Path, with_orcid: bool) -> None: # secondary will be picked up by .idx - folder = cwltool(tmp_path, get_data("tests/wf/sec-wf-out.cwl")) - check_provenance(folder, secondary_files=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/sec-wf-out.cwl"), + with_orcid=with_orcid, + ) + check_provenance(folder, secondary_files=True, with_orcid=with_orcid) # Skipped, not the same secondary files as above # self.check_secondary_files() @needs_docker -def test_directory_workflow(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_directory_workflow(tmp_path: Path, with_orcid: bool) -> None: dir2 = tmp_path / "dir2" dir2.mkdir() sha1 = { @@ -185,8 +234,14 @@ def test_directory_workflow(tmp_path: Path) -> None: with open(dir2 / x, "w", encoding="ascii") as f: f.write(x) - folder = cwltool(tmp_path, get_data("tests/wf/directory.cwl"), "--dir", str(dir2)) - check_provenance(folder, directory=True) + folder = cwltool( + tmp_path, + get_data("tests/wf/directory.cwl"), + "--dir", + str(dir2), + with_orcid=with_orcid, + ) + check_provenance(folder, directory=True, with_orcid=with_orcid) # Output should include ls stdout of filenames a b c on each line file_list = ( @@ -209,10 +264,12 @@ def test_directory_workflow(tmp_path: Path) -> None: @needs_docker -def test_no_data_files(tmp_path: Path) -> None: +@pytest.mark.parametrize("with_orcid", [True, False]) +def test_no_data_files(tmp_path: Path, with_orcid: bool) -> None: folder = cwltool( tmp_path, get_data("tests/wf/conditional_step_no_inputs.cwl"), + with_orcid=with_orcid, ) check_bagit(folder) @@ -263,6 +320,7 @@ def check_provenance( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: check_folders(base_path) check_bagit(base_path) @@ -273,6 +331,7 @@ def check_provenance( single_tool=single_tool, directory=directory, secondary_files=secondary_files, + with_orcid=with_orcid, ) @@ -463,6 +522,7 @@ def check_prov( single_tool: bool = False, directory: bool = False, secondary_files: bool = False, + with_orcid: bool = False, ) -> None: prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt" assert prov_file.is_file(), f"Can't find {prov_file}" @@ -485,7 +545,6 @@ def check_prov( # the has_provenance annotations in manifest.json instead # run should have been started by a wf engine - engines = set(g.subjects(RDF.type, WFPROV.WorkflowEngine)) assert engines, "Could not find WorkflowEngine" assert len(engines) == 1, "Found too many WorkflowEngines: %s" % engines @@ -502,6 +561,39 @@ def check_prov( PROV.SoftwareAgent, ) in g, "Engine not declared as SoftwareAgent" + # run should be associated to the user + accounts = set(g.subjects(RDF.type, FOAF.OnlineAccount)) + assert len(accounts) == 1 + account = accounts.pop() + people = set(g.subjects(RDF.type, SCHEMA.Person)) + assert len(people) == 1, "Can't find associated person in workflow run" + person = people.pop() + if with_orcid: + assert person == URIRef(TEST_ORCID) + else: + account_names = set(g.objects(account, FOAF.accountName)) + assert len(account_names) == 1 + account_name = cast(Literal, account_names.pop()) + machine_user = provenance._whoami()[0] + assert account_name.value == machine_user + + # find the random UUID assigned to cwltool + tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent)) + n_all_agents = 2 + len(tool_agents) + agents = set(g.subjects(RDF.type, PROV.Agent)) + assert ( + len(agents) == n_all_agents + ), "There should be 1 agent per tool (engine), 1 user agent, and 1 cwltool agent" + agents.remove(person) + agents.remove(engine) # the main tool + remain_agents = agents - tool_agents + assert len(remain_agents) == 1 + assert ( + account, + PROV.actedOnBehalfOf, + person, + ) in g, "Association of cwltool agent acting for user is missing" + if single_tool: activities = set(g.subjects(RDF.type, PROV.Activity)) assert len(activities) == 1, "Too many activities: %s" % activities