Test provenance with and without ORCID; fix missing user provenance when running a CommandLineTool directly

fmigneault · fmigneault · commit 78ef52de1914 · 2024-12-11T22:37:23.000-05:00
diff --git a/cwltool/context.py b/cwltool/context.py
@@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None:
         self.orcid: str = ""
         self.cwl_full_name: str = ""
         self.process_run_id: Optional[str] = None
+        self.prov_host: bool = False
+        self.prov_user: bool = False
         self.prov_obj: Optional[ProvenanceProfile] = None
         self.mpi_config: MpiConfig = MpiConfig()
         self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None
diff --git a/cwltool/cwlprov/provenance_profile.py b/cwltool/cwlprov/provenance_profile.py
@@ -27,7 +27,6 @@
     ORE,
     PROVENANCE,
     RO,
-    SCHEMA,
     SHA1,
     SHA256,
     TEXT_PLAIN,
@@ -144,25 +143,10 @@ def generate_prov_doc(self) -> tuple[str, ProvDocument]:
         # .. but we always know cwltool was launched (directly or indirectly)
         # by a user account, as cwltool is a command line tool
         account = self.document.agent(ACCOUNT_UUID)
-        if self.orcid or self.full_name:
-            person: dict[Union[str, Identifier], Any] = {
-                PROV_TYPE: PROV["Person"],
-                "prov:type": SCHEMA["Person"],
-            }
-            if self.full_name:
-                person["prov:label"] = self.full_name
-                person["foaf:name"] = self.full_name
-                person["schema:name"] = self.full_name
-            else:
-                # TODO: Look up name from ORCID API?
-                pass
-            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
-            self.document.actedOnBehalfOf(account, agent)
-        else:
-            if self.host_provenance:
-                self.research_object.host_provenance(self.document)
-            if self.user_provenance:
-                self.research_object.user_provenance(self.document)
+        if self.host_provenance:
+            self.research_object.host_provenance(self.document)
+        if self.user_provenance or self.orcid or self.full_name:
+            self.research_object.user_provenance(self.document)
         # The execution of cwltool
         wfengine = self.document.agent(
             self.engine_uuid,
diff --git a/cwltool/cwlprov/ro.py b/cwltool/cwlprov/ro.py
@@ -37,6 +37,7 @@
     METADATA,
     ORCID,
     PROVENANCE,
+    SCHEMA,
     SHA1,
     SHA256,
     SHA512,
@@ -184,12 +185,14 @@ def user_provenance(self, document: ProvDocument) -> None:
 
         user = document.agent(
             self.orcid or USER_UUID,
-            {
-                provM.PROV_TYPE: provM.PROV["Person"],
-                provM.PROV_LABEL: self.full_name,
-                FOAF["name"]: self.full_name,
-                FOAF["account"]: account,
-            },
+            [
+                (provM.PROV_TYPE, SCHEMA["Person"]),
+                (provM.PROV_TYPE, provM.PROV["Person"]),
+                (provM.PROV_LABEL, self.full_name),
+                (FOAF["name"], self.full_name),
+                (FOAF["account"], account),
+                (SCHEMA["name"], self.full_name),
+            ],
         )
         # cwltool may be started on the shell (directly by user),
         # by shell script (indirectly by user)
diff --git a/cwltool/executors.py b/cwltool/executors.py
@@ -195,8 +195,11 @@ def run_jobs(
         if not isinstance(process, Workflow) and runtime_context.research_obj is not None:
             process.provenance_object = runtime_context.research_obj.initialize_provenance(
                 full_name=runtime_context.cwl_full_name,
-                host_provenance=False,
-                user_provenance=False,
+                # The following are only set from main when a command line tool is run
+                # directly. When nested in a workflow, they should be disabled, since they
+                # would already have been provided/initialized by the parent workflow prov-obj.
+                host_provenance=runtime_context.prov_host,
+                user_provenance=runtime_context.prov_user,
                 orcid=runtime_context.orcid,
                 # single tool execution, so RO UUID = wf UUID = tool UUID
                 run_uuid=runtime_context.research_obj.ro_uuid,
diff --git a/cwltool/main.py b/cwltool/main.py
@@ -1060,6 +1060,11 @@ def main(
 
         loadingContext = setup_loadingContext(loadingContext, runtimeContext, args)
 
+        if loadingContext.research_obj:
+            # forward these parameters early; a single command line tool requires them
+            runtimeContext.prov_host = loadingContext.host_provenance
+            runtimeContext.prov_user = loadingContext.user_provenance
+
         uri, tool_file_uri = resolve_tool_uri(
             args.workflow,
             resolver=loadingContext.resolver,
diff --git a/tests/test_provenance.py b/tests/test_provenance.py
@@ -32,22 +32,23 @@
 SCHEMA = Namespace("http://schema.org/")
 CWLPROV = Namespace("https://w3id.org/cwl/prov#")
 OA = Namespace("http://www.w3.org/ns/oa#")
+FOAF = Namespace("http://xmlns.com/foaf/0.1/")
 
 
 TEST_ORCID = "https://orcid.org/0000-0003-4862-3349"
 
 
-def cwltool(tmp_path: Path, *args: Any) -> Path:
+def cwltool(tmp_path: Path, *args: Any, with_orcid: bool = False) -> Path:
     prov_folder = tmp_path / "provenance"
     prov_folder.mkdir()
     new_args = [
         "--enable-user-provenance",
         "--enable-host-provenance",
-        "--orcid",
-        TEST_ORCID,
         "--provenance",
         str(prov_folder),
     ]
+    if with_orcid:
+        new_args.extend(["--orcid", TEST_ORCID])
     new_args.extend(args)
     # Run within a temporary directory to not pollute git checkout
     tmp_dir = tmp_path / "cwltool-run"
@@ -59,61 +60,81 @@ def cwltool(tmp_path: Path, *args: Any) -> Path:
 
 
 @needs_docker
-def test_hello_workflow(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_hello_workflow(tmp_path: Path, with_orcid: bool) -> None:
     check_provenance(
         cwltool(
             tmp_path,
             get_data("tests/wf/hello-workflow.cwl"),
             "--usermessage",
             "Hello workflow",
-        )
+            with_orcid=with_orcid,
+        ),
+        with_orcid=with_orcid,
     )
 
 
 @needs_docker
-def test_hello_single_tool(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_hello_single_tool(tmp_path: Path, with_orcid: bool) -> None:
     check_provenance(
         cwltool(
             tmp_path,
             get_data("tests/wf/hello_single_tool.cwl"),
             "--message",
             "Hello tool",
+            with_orcid=with_orcid,
         ),
         single_tool=True,
+        with_orcid=with_orcid,
     )
 
 
 @needs_docker
-def test_revsort_workflow(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_revsort_workflow(tmp_path: Path, with_orcid: bool) -> None:
     folder = cwltool(
         tmp_path,
         get_data("tests/wf/revsort.cwl"),
         get_data("tests/wf/revsort-job.json"),
+        with_orcid=with_orcid,
     )
     check_output_object(folder)
-    check_provenance(folder)
+    check_provenance(folder, with_orcid=with_orcid)
 
 
 @needs_docker
-def test_revsort_workflow_shortcut(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_revsort_workflow_shortcut(tmp_path: Path, with_orcid: bool) -> None:
     """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files."""
     folder = cwltool(
         tmp_path,
         get_data("tests/wf/revsort-job-shortcut.json"),
+        with_orcid=with_orcid,
     )
     check_output_object(folder)
-    check_provenance(folder)
+    check_provenance(folder, with_orcid=with_orcid)
     assert not (folder / "snapshot" / "revsort-job-shortcut.json").exists()
     assert len(list((folder / "snapshot").iterdir())) == 4
 
 
 @needs_docker
-def test_nested_workflow(tmp_path: Path) -> None:
-    check_provenance(cwltool(tmp_path, get_data("tests/wf/nested.cwl")), nested=True)
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_nested_workflow(tmp_path: Path, with_orcid: bool) -> None:
+    check_provenance(
+        cwltool(
+            tmp_path,
+            get_data("tests/wf/nested.cwl"),
+            with_orcid=with_orcid,
+        ),
+        nested=True,
+        with_orcid=with_orcid,
+    )
 
 
 @needs_docker
-def test_secondary_files_implicit(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_secondary_files_implicit(tmp_path: Path, with_orcid: bool) -> None:
     file1 = tmp_path / "foo1.txt"
     file1idx = tmp_path / "foo1.txt.idx"
 
@@ -123,13 +144,20 @@ def test_secondary_files_implicit(tmp_path: Path) -> None:
         f.write("bar")
 
     # secondary will be picked up by .idx
-    folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), "--file1", str(file1))
-    check_provenance(folder, secondary_files=True)
+    folder = cwltool(
+        tmp_path,
+        get_data("tests/wf/sec-wf.cwl"),
+        "--file1",
+        str(file1),
+        with_orcid=with_orcid,
+    )
+    check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
     check_secondary_files(folder)
 
 
 @needs_docker
-def test_secondary_files_explicit(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_secondary_files_explicit(tmp_path: Path, with_orcid: bool) -> None:
     # Deliberately do NOT have common basename or extension
     file1dir = tmp_path / "foo"
     file1dir.mkdir()
@@ -164,22 +192,33 @@ def test_secondary_files_explicit(tmp_path: Path) -> None:
         j = json.dumps(job, ensure_ascii=True)
         fp.write(j.encode("ascii"))
 
-    folder = cwltool(tmp_path, get_data("tests/wf/sec-wf.cwl"), str(jobJson))
-    check_provenance(folder, secondary_files=True)
+    folder = cwltool(
+        tmp_path,
+        get_data("tests/wf/sec-wf.cwl"),
+        str(jobJson),
+        with_orcid=with_orcid,
+    )
+    check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
     check_secondary_files(folder)
 
 
 @needs_docker
-def test_secondary_files_output(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_secondary_files_output(tmp_path: Path, with_orcid: bool) -> None:
     # secondary will be picked up by .idx
-    folder = cwltool(tmp_path, get_data("tests/wf/sec-wf-out.cwl"))
-    check_provenance(folder, secondary_files=True)
+    folder = cwltool(
+        tmp_path,
+        get_data("tests/wf/sec-wf-out.cwl"),
+        with_orcid=with_orcid,
+    )
+    check_provenance(folder, secondary_files=True, with_orcid=with_orcid)
     # Skipped, not the same secondary files as above
     # self.check_secondary_files()
 
 
 @needs_docker
-def test_directory_workflow(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_directory_workflow(tmp_path: Path, with_orcid: bool) -> None:
     dir2 = tmp_path / "dir2"
     dir2.mkdir()
     sha1 = {
@@ -195,8 +234,14 @@ def test_directory_workflow(tmp_path: Path) -> None:
         with open(dir2 / x, "w", encoding="ascii") as f:
             f.write(x)
 
-    folder = cwltool(tmp_path, get_data("tests/wf/directory.cwl"), "--dir", str(dir2))
-    check_provenance(folder, directory=True)
+    folder = cwltool(
+        tmp_path,
+        get_data("tests/wf/directory.cwl"),
+        "--dir",
+        str(dir2),
+        with_orcid=with_orcid,
+    )
+    check_provenance(folder, directory=True, with_orcid=with_orcid)
 
     # Output should include ls stdout of filenames a b c on each line
     file_list = (
@@ -219,10 +264,12 @@ def test_directory_workflow(tmp_path: Path) -> None:
 
 
 @needs_docker
-def test_no_data_files(tmp_path: Path) -> None:
+@pytest.mark.parametrize("with_orcid", [True, False])
+def test_no_data_files(tmp_path: Path, with_orcid: bool) -> None:
     folder = cwltool(
         tmp_path,
         get_data("tests/wf/conditional_step_no_inputs.cwl"),
+        with_orcid=with_orcid,
     )
     check_bagit(folder)
 
@@ -273,6 +320,7 @@ def check_provenance(
     single_tool: bool = False,
     directory: bool = False,
     secondary_files: bool = False,
+    with_orcid: bool = False,
 ) -> None:
     check_folders(base_path)
     check_bagit(base_path)
@@ -283,6 +331,7 @@ def check_provenance(
         single_tool=single_tool,
         directory=directory,
         secondary_files=secondary_files,
+        with_orcid=with_orcid,
     )
 
 
@@ -473,6 +522,7 @@ def check_prov(
     single_tool: bool = False,
     directory: bool = False,
     secondary_files: bool = False,
+    with_orcid: bool = False,
 ) -> None:
     prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
     assert prov_file.is_file(), f"Can't find {prov_file}"
@@ -512,10 +562,20 @@ def check_prov(
     ) in g, "Engine not declared as SoftwareAgent"
 
     # run should be associated to the user
+    accounts = set(g.subjects(RDF.type, FOAF.OnlineAccount))
+    assert len(accounts) == 1
+    account = accounts.pop()
     people = set(g.subjects(RDF.type, SCHEMA.Person))
     assert len(people) == 1, "Can't find associated person in workflow run"
     person = people.pop()
-    assert person == URIRef(TEST_ORCID)
+    if with_orcid:
+        assert person == URIRef(TEST_ORCID)
+    else:
+        account_names = set(g.objects(account, FOAF.accountName))
+        assert len(account_names) == 1
+        account_name = account_names.pop()
+        machine_user = provenance._whoami()[0]
+        assert account_name.value == machine_user
 
     # find the random UUID assigned to cwltool
     tool_agents = set(g.subjects(RDF.type, PROV.SoftwareAgent))
@@ -528,9 +588,8 @@ def check_prov(
     agents.remove(engine)  # the main tool
     remain_agents = agents - tool_agents
     assert len(remain_agents) == 1
-    cwltool_agent = remain_agents.pop()
     assert (
-        cwltool_agent,
+        account,
         PROV.actedOnBehalfOf,
         person,
     ) in g, "Association of cwltool agent acting for user is missing"