Merge branch 'PROV_io' of https://github.com/rcali21/pydra

djarecka · djarecka · commit 54bb7642b1f6 · 2022-11-18T14:50:43.000-05:00
diff --git a/pydra/engine/audit.py b/pydra/engine/audit.py
@@ -4,7 +4,7 @@
 import json
 import attr
 from ..utils.messenger import send_message, make_message, gen_uuid, now, AuditFlag
-from .helpers import ensure_list, gather_runtime_info
+from .helpers import ensure_list, gather_runtime_info, hash_file
 
 
 class Audit:
@@ -170,22 +170,65 @@ def audit_check(self, flag):
         return self.audit_flags & flag
 
     def audit_task(self, task):
+        """Send a "task" PROV message (label, command, start time, version).
+
+        Also sends an "input" message with the location/digest of ``in_file``.
+        """
+        import subprocess as sp
         label = task.name
+
         if hasattr(task.inputs, "executable"):
             command = task.cmdline
         # assume function task
         else:
-            # work on changing this to function name
             command = None
 
+        # describe the input file only when the task declares an ``in_file``
+        if hasattr(task.inputs, "in_file"):
+            input_file = task.inputs.in_file
+            file_hash = hash_file(input_file)
+            at_location = os.path.abspath(input_file)
+        else:
+            file_hash = None
+            at_location = None
+
+        if command is not None:
+            # take the first word of command as the
+            # name of the executable
+            # (this may not always be the case)
+            cmd_name = command.split()[0]
+            software = f"{cmd_name} --version"
+            version_cmd = sp.run(software, shell=True, stdout=sp.PIPE).stdout.decode(
+                "utf-8"
+            )
+            try:
+                version_cmd = version_cmd.splitlines()[0]
+            except IndexError:
+                version_cmd = f"{cmd_name} -- Version unknown"
+        else:
+            version_cmd = None
+
         start_message = {
             "@id": self.aid,
             "@type": "task",
-            "label": label,
-            "command": command,
-            "startedAtTime": now(),
+            "Label": label,
+            "Command": command,
+            "StartedAtTime": now(),
+            "AssociatedWith": version_cmd,
         }
-        self.audit_message(start_message, AuditFlag.PROV)
 
+        entity_message = {
+            "@id": self.aid,
+            "Label": None,  # placeholder: no label is recorded for the input yet
+            "AtLocation": at_location,
+            "GeneratedBy": "test",  # if not part of workflow, this will be none
+            "@type": "input",
+            "digest": file_hash,  # hash value under helpers.py
+        }
+
+        # new code to be added here for i/o tracking - WIP
+
+        self.audit_message(start_message, AuditFlag.PROV)
+        self.audit_message(entity_message, AuditFlag.PROV)
         # add more fields according to BEP208 doc
         # with every field, check in tests
diff --git a/pydra/engine/tests/test_task.py b/pydra/engine/tests/test_task.py
@@ -11,8 +11,17 @@
 from ..core import Workflow
 from ..task import AuditFlag, ShellCommandTask, DockerTask, SingularityTask
 from ...utils.messenger import FileMessenger, PrintMessenger, collect_messages
-from .utils import gen_basic_wf, use_validator
-from ..specs import MultiInputObj, MultiOutputObj, SpecInfo, FunctionSpec, BaseSpec
+from .utils import gen_basic_wf, use_validator, Submitter
+from ..specs import (
+    MultiInputObj,
+    MultiOutputObj,
+    SpecInfo,
+    FunctionSpec,
+    BaseSpec,
+    ShellSpec,
+    File,
+)
+from ..helpers import hash_file
 
 no_win = pytest.mark.skipif(
     sys.platform.startswith("win"),
@@ -998,15 +1007,30 @@ def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]
     funky.cache_dir = tmpdir
     funky()
     message_path = tmpdir / funky.checksum / "messages"
+    print(message_path)
     # go through each jsonld file in message_path and check if the label field exists
     json_content = []
+
     for file in glob(str(message_path) + "/*.jsonld"):
         with open(file, "r") as f:
             data = json.load(f)
-            if "label" in data:
-                json_content.append(True)
-                assert "testfunc" == data["label"]
-    assert any(json_content)
+            if "@type" in data:
+                if "AssociatedWith" in data:
+                    assert "testfunc" in data["Label"]
+
+            if "@type" in data:
+                if data["@type"] == "input":
+                    assert None == data["Label"]
+                    # placeholder for atlocation until
+                    # new test is added
+                    assert None == data["AtLocation"]
+
+                # assert data["Type"] == "input"
+
+            if "AssociatedWith" in data:
+                assert None == data["AssociatedWith"]
+
+    # assert any(json_content)
 
 
 def test_audit_shellcommandtask(tmpdir):
@@ -1025,20 +1049,106 @@ def test_audit_shellcommandtask(tmpdir):
     shelly()
     message_path = tmpdir / shelly.checksum / "messages"
     # go through each jsonld file in message_path and check if the label field exists
-    label_content = []
+
     command_content = []
 
     for file in glob(str(message_path) + "/*.jsonld"):
         with open(file, "r") as f:
             data = json.load(f)
-            if "label" in data:
-                label_content.append(True)
-            if "command" in data:
+
+            if "@type" in data:
+                if "AssociatedWith" in data:
+                    assert "shelly" in data["Label"]
+
+            if "@type" in data:
+                if data["@type"] == "input":
+                    assert data["Label"] == None
+
+            if "Command" in data:
                 command_content.append(True)
-                assert "ls -l" == data["command"]
+                assert "ls -l" == data["Command"]
+
+    assert any(command_content)
+
+
+def test_audit_shellcommandtask_file(tmpdir):
+    """Audit a ``cat`` task and check the input file's location and digest."""
+    import glob
+
+    with open(tmpdir / "test.txt", "w") as f:
+        f.write("This is a test.")
+
+    file_in = tmpdir / "test.txt"
+    test_file_hash = hash_file(file_in)
+    my_input_spec = SpecInfo(
+        name="Input",
+        fields=[
+            (
+                "in_file",
+                attr.ib(
+                    type=File,
+                    metadata={
+                        "position": 1,
+                        "argstr": "",
+                        "help_string": "text",
+                        "mandatory": True,
+                    },
+                ),
+            )
+        ],
+        bases=(ShellSpec,),
+    )
+    shelly = ShellCommandTask(
+        name="shelly",
+        in_file=file_in,
+        input_spec=my_input_spec,
+        executable="cat",
+        audit_flags=AuditFlag.PROV,
+        messengers=FileMessenger(),  # the jsonld files are read back below
+    )
+    shelly.cache_dir = tmpdir
+    shelly()
+    message_path = tmpdir / shelly.checksum / "messages"
+    for file in glob.glob(str(message_path) + "/*.jsonld"):
+        with open(file, "r") as f:
+            data = json.load(f)
+            if "AtLocation" in data:
+                assert data["AtLocation"] == str(file_in)
+            if "digest" in data:
+                assert test_file_hash == data["digest"]
+
+
+def test_audit_shellcommandtask_version(tmpdir):
+    """Check that "AssociatedWith" holds the first line of ``less --version``."""
+    import glob
+    import subprocess as sp
+
+    # expected value: the first line printed by ``less --version``
+    probe = sp.run("less --version", shell=True, stdout=sp.PIPE)
+    expected_version = probe.stdout.decode("utf-8").splitlines()[0]
+
+    shelly = ShellCommandTask(
+        name="shelly",
+        executable="less",
+        args="test_task.py",
+        audit_flags=AuditFlag.PROV,
+        messengers=FileMessenger(),
+    )
+    shelly.cache_dir = tmpdir
+    shelly()
+
+    # flag every jsonld message whose "AssociatedWith" contains that version
+    message_path = tmpdir / shelly.checksum / "messages"
+    matches = []
+    for jsonld_file in glob.glob(str(message_path) + "/*.jsonld"):
+        with open(jsonld_file, "r") as f:
+            data = json.load(f)
+        if "AssociatedWith" not in data:
+            continue
+        if expected_version in data["AssociatedWith"]:
+            matches.append(True)
 
-    print(command_content)
-    assert any(label_content)
+    assert any(matches)
 
 
 def test_audit_prov_messdir_1(tmpdir, use_validator):
@@ -1137,7 +1247,7 @@ def testfunc(a: int, b: float = 0.1) -> ty.NamedTuple("Output", [("out", float)]
     from glob import glob
 
     assert len(glob(str(tmpdir / funky.checksum / "proc*.log"))) == 1
-    assert len(glob(str(message_path / "*.jsonld"))) == 7
+    assert len(glob(str(message_path / "*.jsonld"))) == 8
 
     # commented out to speed up testing
     collect_messages(tmpdir / funky.checksum, message_path, ld_op="compact")