Skip to content

Commit 527884b

Browse files
fmigneault and mr-c authored
Proposal: Improved ProvenanceProfile definition (#2082)
Co-authored-by: Michael R. Crusoe <[email protected]>
1 parent f1d192d commit 527884b

File tree

9 files changed

+225
-85
lines changed

9 files changed

+225
-85
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ eggs/
1616
*.egg
1717
.tox/
1818
.pytest_cache
19+
*.so
1920

2021
# Editor Temps
2122
.*.sw?

cwltool/context.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,8 @@ def __init__(self, kwargs: Optional[dict[str, Any]] = None) -> None:
183183
self.orcid: str = ""
184184
self.cwl_full_name: str = ""
185185
self.process_run_id: Optional[str] = None
186+
self.prov_host: bool = False
187+
self.prov_user: bool = False
186188
self.prov_obj: Optional[ProvenanceProfile] = None
187189
self.mpi_config: MpiConfig = MpiConfig()
188190
self.default_stdout: Optional[Union[IO[bytes], TextIO]] = None

cwltool/cwlprov/provenance_profile.py

Lines changed: 4 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
from collections.abc import MutableMapping, MutableSequence, Sequence
77
from io import BytesIO
88
from pathlib import PurePath, PurePosixPath
9-
from socket import getfqdn
109
from typing import TYPE_CHECKING, Any, Optional, Union, cast
1110

1211
from prov.identifier import Identifier, QualifiedName
@@ -24,12 +23,10 @@
2423
ACCOUNT_UUID,
2524
CWLPROV,
2625
ENCODING,
27-
FOAF,
2826
METADATA,
2927
ORE,
3028
PROVENANCE,
3129
RO,
32-
SCHEMA,
3330
SHA1,
3431
SHA256,
3532
TEXT_PLAIN,
@@ -108,25 +105,6 @@ def __str__(self) -> str:
108105

109106
def generate_prov_doc(self) -> tuple[str, ProvDocument]:
110107
"""Add basic namespaces."""
111-
112-
def host_provenance(document: ProvDocument) -> None:
113-
"""Record host provenance."""
114-
document.add_namespace(CWLPROV)
115-
document.add_namespace(UUID)
116-
document.add_namespace(FOAF)
117-
118-
hostname = getfqdn()
119-
# won't have a foaf:accountServiceHomepage for unix hosts, but
120-
# we can at least provide hostname
121-
document.agent(
122-
ACCOUNT_UUID,
123-
{
124-
PROV_TYPE: FOAF["OnlineAccount"],
125-
"prov:location": hostname,
126-
CWLPROV["hostname"]: hostname,
127-
},
128-
)
129-
130108
self.cwltool_version = f"cwltool {versionstring().split()[-1]}"
131109
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
132110
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
@@ -165,25 +143,10 @@ def host_provenance(document: ProvDocument) -> None:
165143
# .. but we always know cwltool was launched (directly or indirectly)
166144
# by a user account, as cwltool is a command line tool
167145
account = self.document.agent(ACCOUNT_UUID)
168-
if self.orcid or self.full_name:
169-
person: dict[Union[str, Identifier], Any] = {
170-
PROV_TYPE: PROV["Person"],
171-
"prov:type": SCHEMA["Person"],
172-
}
173-
if self.full_name:
174-
person["prov:label"] = self.full_name
175-
person["foaf:name"] = self.full_name
176-
person["schema:name"] = self.full_name
177-
else:
178-
# TODO: Look up name from ORCID API?
179-
pass
180-
agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
181-
self.document.actedOnBehalfOf(account, agent)
182-
else:
183-
if self.host_provenance:
184-
host_provenance(self.document)
185-
if self.user_provenance:
186-
self.research_object.user_provenance(self.document)
146+
if self.host_provenance:
147+
self.research_object.host_provenance(self.document)
148+
if self.user_provenance or self.orcid or self.full_name:
149+
self.research_object.user_provenance(self.document)
187150
# The execution of cwltool
188151
wfengine = self.document.agent(
189152
self.engine_uuid,

cwltool/cwlprov/ro.py

Lines changed: 87 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,11 @@
99
import uuid
1010
from collections.abc import MutableMapping, MutableSequence
1111
from pathlib import Path, PurePosixPath
12-
from typing import IO, Any, Optional, Union, cast
12+
from socket import getfqdn
13+
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
1314

1415
import prov.model as provM
15-
from prov.model import PROV, ProvDocument
16+
from prov.model import ProvDocument
1617

1718
from ..loghandler import _logger
1819
from ..stdfsaccess import StdFsAccess
@@ -27,6 +28,7 @@
2728
from . import Aggregate, Annotation, AuthoredBy, _valid_orcid, _whoami, checksum_copy
2829
from .provenance_constants import (
2930
ACCOUNT_UUID,
31+
CWLPROV,
3032
CWLPROV_VERSION,
3133
DATA,
3234
ENCODING,
@@ -35,6 +37,7 @@
3537
METADATA,
3638
ORCID,
3739
PROVENANCE,
40+
SCHEMA,
3841
SHA1,
3942
SHA256,
4043
SHA512,
@@ -46,6 +49,9 @@
4649
Hasher,
4750
)
4851

52+
if TYPE_CHECKING:
53+
from .provenance_profile import ProvenanceProfile # pylint: disable=unused-import
54+
4955

5056
class ResearchObject:
5157
"""CWLProv Research Object."""
@@ -82,6 +88,34 @@ def __init__(
8288
self._initialize()
8389
_logger.debug("[provenance] Temporary research object: %s", self.folder)
8490

91+
def initialize_provenance(
92+
self,
93+
full_name: str,
94+
host_provenance: bool,
95+
user_provenance: bool,
96+
orcid: str,
97+
fsaccess: StdFsAccess,
98+
run_uuid: Optional[uuid.UUID] = None,
99+
) -> "ProvenanceProfile":
100+
"""
101+
Provide a provenance profile initialization hook function.
102+
103+
Allows overriding the default strategy to define the
104+
provenance profile concepts and associations to extend
105+
details as needed.
106+
"""
107+
from .provenance_profile import ProvenanceProfile
108+
109+
return ProvenanceProfile(
110+
research_object=self,
111+
full_name=full_name,
112+
host_provenance=host_provenance,
113+
user_provenance=user_provenance,
114+
orcid=orcid,
115+
fsaccess=fsaccess,
116+
run_uuid=run_uuid,
117+
)
118+
85119
def self_check(self) -> None:
86120
"""Raise ValueError if this RO is closed."""
87121
if self.closed:
@@ -117,10 +151,22 @@ def _initialize_bagit(self) -> None:
117151
bag_it_file.write("BagIt-Version: 0.97\n")
118152
bag_it_file.write(f"Tag-File-Character-Encoding: {ENCODING}\n")
119153

154+
def resolve_user(self) -> tuple[str, str]:
155+
"""
156+
Provide a user provenance hook function.
157+
158+
Allows overriding the default strategy to retrieve user provenance
159+
in case the calling code can provide a better resolution.
160+
The function must return a tuple of the (username, fullname)
161+
that identifies the user. This user will be applied on top
162+
of any provided ORCID or fullname by agent association.
163+
"""
164+
return _whoami()
165+
120166
def user_provenance(self, document: ProvDocument) -> None:
121167
"""Add the user provenance."""
122168
self.self_check()
123-
(username, fullname) = _whoami()
169+
(username, fullname) = self.resolve_user()
124170

125171
if not self.full_name:
126172
self.full_name = fullname
@@ -132,19 +178,21 @@ def user_provenance(self, document: ProvDocument) -> None:
132178
ACCOUNT_UUID,
133179
{
134180
provM.PROV_TYPE: FOAF["OnlineAccount"],
135-
"prov:label": username,
181+
provM.PROV_LABEL: username,
136182
FOAF["accountName"]: username,
137183
},
138184
)
139185

140186
user = document.agent(
141187
self.orcid or USER_UUID,
142-
{
143-
provM.PROV_TYPE: PROV["Person"],
144-
"prov:label": self.full_name,
145-
FOAF["name"]: self.full_name,
146-
FOAF["account"]: account,
147-
},
188+
[
189+
(provM.PROV_TYPE, SCHEMA["Person"]),
190+
(provM.PROV_TYPE, provM.PROV["Person"]),
191+
(provM.PROV_LABEL, self.full_name),
192+
(FOAF["name"], self.full_name),
193+
(FOAF["account"], account),
194+
(SCHEMA["name"], self.full_name),
195+
],
148196
)
149197
# cwltool may be started on the shell (directly by user),
150198
# by shell script (indirectly by user)
@@ -156,6 +204,35 @@ def user_provenance(self, document: ProvDocument) -> None:
156204
# get their name wrong!)
157205
document.actedOnBehalfOf(account, user)
158206

207+
def resolve_host(self) -> tuple[str, str]:
208+
"""
209+
Provide a host provenance hook function.
210+
211+
Allows overriding the default strategy to retrieve host provenance
212+
in case the calling code can provide a better resolution.
213+
The function must return a tuple of the (fqdn, uri) that identifies the host.
214+
"""
215+
fqdn = getfqdn()
216+
return fqdn, fqdn # allow for (fqdn, uri) to be distinct, but the same by default
217+
218+
def host_provenance(self, document: ProvDocument) -> None:
219+
"""Record host provenance."""
220+
document.add_namespace(CWLPROV)
221+
document.add_namespace(UUID)
222+
document.add_namespace(FOAF)
223+
224+
hostname, uri = self.resolve_host()
225+
# won't have a foaf:accountServiceHomepage for unix hosts, but
226+
# we can at least provide hostname
227+
document.agent(
228+
ACCOUNT_UUID,
229+
{
230+
provM.PROV_TYPE: FOAF["OnlineAccount"],
231+
provM.PROV_LOCATION: uri,
232+
CWLPROV["hostname"]: hostname,
233+
},
234+
)
235+
159236
def add_tagfile(self, path: str, timestamp: Optional[datetime.datetime] = None) -> None:
160237
"""Add tag files to our research object."""
161238
self.self_check()

cwltool/executors.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@
1919
from .command_line_tool import CallbackJob, ExpressionJob
2020
from .context import RuntimeContext, getdefault
2121
from .cuda import cuda_version_and_device_count
22-
from .cwlprov.provenance_profile import ProvenanceProfile
2322
from .errors import WorkflowException
2423
from .job import JobBase
2524
from .loghandler import _logger
@@ -194,11 +193,13 @@ def run_jobs(
194193

195194
# define provenance profile for single commandline tool
196195
if not isinstance(process, Workflow) and runtime_context.research_obj is not None:
197-
process.provenance_object = ProvenanceProfile(
198-
runtime_context.research_obj,
196+
process.provenance_object = runtime_context.research_obj.initialize_provenance(
199197
full_name=runtime_context.cwl_full_name,
200-
host_provenance=False,
201-
user_provenance=False,
198+
# the following are only set from main when directly running a command line tool;
199+
# when nested in a workflow, they should be disabled since they would
200+
# already have been provided/initialized by the parent workflow prov-obj
201+
host_provenance=runtime_context.prov_host,
202+
user_provenance=runtime_context.prov_user,
202203
orcid=runtime_context.orcid,
203204
# single tool execution, so RO UUID = wf UUID = tool UUID
204205
run_uuid=runtime_context.research_obj.ro_uuid,

cwltool/main.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1065,6 +1065,11 @@ def main(
10651065

10661066
loadingContext = setup_loadingContext(loadingContext, runtimeContext, args)
10671067

1068+
if loadingContext.research_obj:
1069+
# early forward parameters required for a single command line tool
1070+
runtimeContext.prov_host = loadingContext.host_provenance
1071+
runtimeContext.prov_user = loadingContext.user_provenance
1072+
10681073
uri, tool_file_uri = resolve_tool_uri(
10691074
args.workflow,
10701075
resolver=loadingContext.resolver,

cwltool/workflow.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -72,8 +72,7 @@ def __init__(
7272
if is_main:
7373
run_uuid = loadingContext.research_obj.ro_uuid
7474

75-
self.provenance_object = ProvenanceProfile(
76-
loadingContext.research_obj,
75+
self.provenance_object = loadingContext.research_obj.initialize_provenance(
7776
full_name=loadingContext.cwl_full_name,
7877
host_provenance=loadingContext.host_provenance,
7978
user_provenance=loadingContext.user_provenance,

mypy-stubs/rdflib/graph.pyi

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ from rdflib import query
1616
from rdflib.collection import Collection
1717
from rdflib.paths import Path
1818
from rdflib.resource import Resource
19-
from rdflib.term import BNode, Identifier, Node
19+
from rdflib.term import BNode, Identifier, Literal, Node
2020

2121
class Graph(Node):
2222
base: Any = ...
@@ -66,7 +66,7 @@ class Graph(Node):
6666
) -> Iterable[Node]: ...
6767
def objects(
6868
self, subject: Optional[Any] = ..., predicate: Optional[Any] = ...
69-
) -> Iterable[Identifier]: ...
69+
) -> Iterable[Union[Identifier, Literal]]: ...
7070
def subject_predicates(self, object: Optional[Any] = ...) -> None: ...
7171
def subject_objects(self, predicate: Optional[Any] = ...) -> None: ...
7272
def predicate_objects(self, subject: Optional[Any] = ...) -> None: ...

0 commit comments

Comments
 (0)