Merge pull request #7032 from aldbr/rel-v8r0_FEAT_RemoteRunnerChecksum

fstagni · web-flow · commit daf87a20a44a · 2023-06-05T12:24:08.000+02:00
[8.0] feat & fix: check the integrity of the outputs in RemoteRunner
diff --git a/src/DIRAC/Resources/Computing/AREXComputingElement.py b/src/DIRAC/Resources/Computing/AREXComputingElement.py
@@ -26,6 +26,7 @@
 import os
 import json
 import requests
+import shutil
 
 from DIRAC import S_OK, S_ERROR
 from DIRAC.Core.Security import Locations
@@ -144,7 +145,7 @@ def _urlJoin(self, command):
         """
         return os.path.join(self.base_url, command)
 
-    def _request(self, method, query, params=None, data=None, headers=None, timeout=None):
+    def _request(self, method, query, params=None, data=None, headers=None, timeout=None, stream=False):
         """Perform a request and properly handle the results/exceptions.
 
         :param str method: "post", "get", "put"
@@ -164,12 +165,7 @@ def _request(self, method, query, params=None, data=None, headers=None, timeout=
 
         try:
             response = self.session.request(
-                method,
-                query,
-                headers=headers,
-                params=params,
-                data=data,
-                timeout=timeout,
+                method, query, headers=headers, params=params, data=data, timeout=timeout, stream=stream
             )
             if not response.ok:
                 return S_ERROR(f"Response: {response.status_code} - {response.reason}")
@@ -811,20 +807,21 @@ def getJobOutput(self, jobID, workingDirectory=None):
             query = self._urlJoin(os.path.join("jobs", job, "session", remoteOutput))
 
             # Submit the GET request to retrieve outputs
-            result = self._request("get", query)
+            result = self._request("get", query, stream=True)
             if not result["OK"]:
                 self.log.error("Error downloading", f"{remoteOutput} for {job}: {result['Message']}")
                 return S_ERROR(f"Error downloading {remoteOutput} for {jobID}")
             response = result["Value"]
-            outputContent = response.text
+
+            localOutput = os.path.join(workingDirectory, remoteOutput)
+            with open(localOutput, "wb") as f:
+                shutil.copyfileobj(response.raw, f)
 
             if remoteOutput == f"{stamp}.out":
-                stdout = outputContent
-            elif remoteOutput == f"{stamp}.err":
-                stderr = outputContent
-            else:
-                localOutput = os.path.join(workingDirectory, remoteOutput)
-                with open(localOutput, "w") as f:
-                    f.write(outputContent)
+                with open(localOutput) as f:
+                    stdout = f.read()
+            if remoteOutput == f"{stamp}.err":
+                with open(localOutput) as f:
+                    stderr = f.read()
 
         return S_OK((stdout, stderr))
diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py b/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py
@@ -6,6 +6,7 @@
 Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
 the script/application execution on a remote machine.
 """
+import hashlib
 import os
 import shlex
 import time
@@ -22,6 +23,9 @@
 class RemoteRunner:
     def __init__(self, siteName=None, ceName=None, queueName=None):
         self.log = gLogger.getSubLogger("RemoteRunner")
+        self.executable = "workloadExec.sh"
+        self.checkSumOutput = "md5Checksum.txt"
+
         self._workloadSite = siteName
         if not self._workloadSite:
             self.log.warn("You are expected to provide a siteName in parameters from v8.0")
@@ -61,44 +65,44 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
         self.log.verbose("Command to submit:", command)
 
         # Check whether CE parameters are set
-        result = self._checkParameters()
-        if not result["OK"]:
+        if not (result := self._checkParameters())["OK"]:
             result["Errno"] = DErrno.ESECTION
             return result
-        self.log.verbose(
-            "The command will be sent to",
+        self.log.info(
+            "Preparing and submitting the command to",
             f"site {self._workloadSite}, CE {self._workloadCE}, queue {self._workloadQueue}",
         )
 
         # Set up Application Queue
-        result = self._setUpWorkloadCE(numberOfProcessors)
-        if not result["OK"]:
+        if not (result := self._setUpWorkloadCE(numberOfProcessors))["OK"]:
             result["Errno"] = DErrno.ERESUNA
             return result
         workloadCE = result["Value"]
         self.log.debug("The CE interface has been set up")
 
         # Add the command in an executable file
-        executable = "workloadExec.sh"
-        self._wrapCommand(command, workingDirectory, executable)
+        self._wrapCommand(command, workingDirectory)
         self.log.debug("The command has been wrapped into an executable")
 
         # Get inputs from the current working directory
         inputs = os.listdir(workingDirectory)
-        inputs.remove(os.path.basename(executable))
+        inputs.remove(os.path.basename(self.executable))
         self.log.verbose("The executable will be sent along with the following inputs:", ",".join(inputs))
         # Request the whole directory as output
         outputs = ["/"]
 
         # Submit the command as a job
-        result = workloadCE.submitJob(executable, workloadCE.proxy, inputs=inputs, outputs=outputs)
-        if not result["OK"]:
+        if not (result := workloadCE.submitJob(self.executable, workloadCE.proxy, inputs=inputs, outputs=outputs))[
+            "OK"
+        ]:
             result["Errno"] = DErrno.EWMSSUBM
             return result
         jobID = result["Value"][0]
         stamp = result["PilotStampDict"][jobID]
+        self.log.info("The command has been wrapped in a job and sent. Remote JobID: ", jobID)
 
         # Get status of the job
+        self.log.info("Waiting for the end of the job...")
         jobStatus = PilotStatus.RUNNING
         while jobStatus not in PilotStatus.PILOT_FINAL_STATES:
             time.sleep(120)
@@ -107,20 +111,27 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
                 result["Errno"] = DErrno.EWMSSTATUS
                 return result
             jobStatus = result["Value"][jobID]
-        self.log.verbose("The final status of the application/script is: ", jobStatus)
+        self.log.info("The final status of the application/script is: ", jobStatus)
 
         # Get job outputs
-        result = workloadCE.getJobOutput(f"{jobID}:::{stamp}", os.path.abspath("."))
-        if not result["OK"]:
+        self.log.info("Getting the outputs of the command...")
+        if not (result := workloadCE.getJobOutput(f"{jobID}:::{stamp}", os.path.abspath(".")))["OK"]:
             result["Errno"] = DErrno.EWMSJMAN
             return result
         output, error = result["Value"]
 
+        # Make sure the output is correct
+        self.log.info("Checking the integrity of the outputs...")
+        if not (result := self._checkOutputIntegrity("."))["OK"]:
+            result["Errno"] = DErrno.EWMSJMAN
+            return result
+        self.log.info("The output has been retrieved and declared complete")
+
         # Clean job in the remote resource
         if cleanRemoteJob:
-            result = workloadCE.cleanJob(jobID)
-            if not result["OK"]:
+            if not (result := workloadCE.cleanJob(jobID))["OK"]:
                 self.log.warn("Failed to clean the output remotely", result["Message"])
+            self.log.info("The job has been remotely removed")
 
         commandStatus = {"Done": 0, "Failed": -1, "Killed": -2}
         return S_OK((commandStatus[jobStatus], output, error))
@@ -190,12 +201,11 @@ def _setUpWorkloadCE(self, numberOfProcessorsPayload=1):
 
         return S_OK(workloadCE)
 
-    def _wrapCommand(self, command, workingDirectory, executable):
+    def _wrapCommand(self, command, workingDirectory):
         """Wrap the command in a file
 
         :param str command: command line to write in the executable
         :param str workingDirectory: directory containing the inputs required by the command
-        :param str executable: path of the executable that should contain the command to submit
         :return: path of the executable
         """
         # Check whether the command contains any absolute path: there would be no way to access them remotely
@@ -219,5 +229,34 @@ def _wrapCommand(self, command, workingDirectory, executable):
             argumentsProcessed.append(os.path.join(".", os.path.basename(argument)))
 
         command = shlex.join(argumentsProcessed)
-        with open(executable, "w") as f:
+        with open(self.executable, "w") as f:
             f.write(command)
+            # Post-processing: compute the checksum of the outputs
+            f.write(f"\nmd5sum * > {self.checkSumOutput}")
+
+    def _checkOutputIntegrity(self, workingDirectory):
+        """Make sure that output files are not corrupted.
+
+        :param str workingDirectory: path of the outputs
+        """
+        checkSumOutput = os.path.join(workingDirectory, self.checkSumOutput)
+        if not os.path.exists(checkSumOutput):
+            return S_ERROR(f"Cannot guarantee the integrity of the outputs: {checkSumOutput} unavailable")
+
+        with open(checkSumOutput) as f:
+            # for each output file, compute the md5 checksum
+            for line in f:
+                checkSum, remoteOutput = list(filter(None, line.strip("\n").split(" ")))
+
+                hash = hashlib.md5()
+                localOutput = os.path.join(workingDirectory, remoteOutput)
+                if not os.path.exists(localOutput):
+                    return S_ERROR(f"{localOutput} was expected but not found")
+
+                with open(localOutput, "rb") as f:
+                    while chunk := f.read(128 * hash.block_size):
+                        hash.update(chunk)
+                if checkSum != hash.hexdigest():
+                    return S_ERROR(f"{localOutput} is corrupted")
+
+        return S_OK()
diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py b/src/DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py
@@ -7,7 +7,7 @@
 from diraccfg import CFG
 
 # DIRAC Components
-from DIRAC import gLogger, gConfig, S_OK
+from DIRAC import gLogger, gConfig, S_OK, S_ERROR
 from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData
 from DIRAC.Core.Security.X509Chain import X509Chain  # pylint: disable=import-error
 from DIRAC.WorkloadManagementSystem.Utilities.RemoteRunner import RemoteRunner
@@ -40,13 +40,16 @@ def test__wrapCommand(command, workingDirectory, expectedContent):
 
     # Instantiate a RemoteRunner and wrap the command
     remoteRunner = RemoteRunner("Site1", "CE1", "queue1")
-    remoteRunner._wrapCommand(command, workingDirectory, executable)
+    remoteRunner._wrapCommand(command, workingDirectory)
 
     # Test the results
-    assert os.path.isfile(executable)
-    with open(executable) as f:
+    assert os.path.isfile(remoteRunner.executable)
+    with open(remoteRunner.executable) as f:
         content = f.read()
-    os.remove(executable)
+    os.remove(remoteRunner.executable)
+
+    # This line is added at the end of the wrapper for any command
+    expectedContent += f"\nmd5sum * > {remoteRunner.checkSumOutput}"
     assert content == expectedContent
 
 
@@ -105,3 +108,64 @@ def test__setUpWorkloadCE(
         assert workloadCE.ceParameters["NumberOfProcessors"] == expectedNumberOfProcessors
     else:
         assert result["Message"] == expectedNumberOfProcessors
+
+
+@pytest.mark.parametrize(
+    "checkSumDict, expectedResult",
+    [
+        # Normal case
+        ({"file1.txt": "826e8142e6baabe8af779f5f490cf5f5", "file2.txt": "1c1c96fd2cf8330db0bfa936ce82f3b9"}, S_OK()),
+        # Files are corrupted
+        (
+            {"file1.txt": "c12f72e7b198fdbfe5f70c66dc6082c8", "file2.txt": "5ec149e38f09fb716b1e0f4cf23af679"},
+            S_ERROR("./file1.txt is corrupted"),
+        ),
+        (
+            {"file1.txt": "826e8142e6baabe8af779f5f490cf5f5", "file2.txt": "5ec149e38f09fb716b1e0f4cf23af679"},
+            S_ERROR("./file2.txt is corrupted"),
+        ),
+        # Files do not exist
+        (
+            {
+                "file3.txt": "826e8142e6baabe8af779f5f490cf5f5",
+            },
+            S_ERROR("./file3.txt was expected but not found"),
+        ),
+        # remoteRunner.checkSumOutput is empty
+        ({}, S_OK()),
+        # remoteRunner.checkSumOutput does not exist
+        (None, S_ERROR("Cannot guarantee the integrity of the outputs")),
+    ],
+)
+def test__checkOutputIntegrity(checkSumDict, expectedResult):
+    """Test RemoteRunner()._checkOutputIntegrity()"""
+    # Instantiate a RemoteRunner
+    remoteRunner = RemoteRunner("Site1", "CE1", "queue1")
+
+    # Create some files in workingDirectory
+    workingDirectory = "."
+    with open(os.path.join(workingDirectory, "file1.txt"), "w") as f:
+        f.write("file1")
+    with open(os.path.join(workingDirectory, "file2.txt"), "w") as f:
+        f.write("file2")
+
+    # Create remoteRunner.checkSumOutput
+    if checkSumDict is not None:
+        with open(os.path.join(workingDirectory, remoteRunner.checkSumOutput), "w") as f:
+            for file, checkSum in checkSumDict.items():
+                f.write(f"{checkSum}  {file}\n")
+
+    # Check the integrity of the output
+    result = remoteRunner._checkOutputIntegrity(".")
+
+    # Test the results
+    print(result)
+    assert result["OK"] is expectedResult["OK"]
+    if not expectedResult["OK"]:
+        assert expectedResult["Message"] in result["Message"]
+
+    # Delete files
+    os.remove(os.path.join(workingDirectory, "file1.txt"))
+    os.remove(os.path.join(workingDirectory, "file2.txt"))
+    if os.path.exists(os.path.join(workingDirectory, remoteRunner.checkSumOutput)):
+        os.remove(remoteRunner.checkSumOutput)