feat: RemoteRunner refactoring and new features

aldbr · aldbr · commit b42aa1f0b1e0 · 2023-04-12T09:40:56.000+02:00
diff --git a/src/DIRAC/Workflow/Modules/Script.py b/src/DIRAC/Workflow/Modules/Script.py
@@ -13,7 +13,7 @@
 import shlex
 import distutils.spawn  # pylint: disable=no-name-in-module,no-member,import-error
 
-from DIRAC import gLogger
+from DIRAC import gLogger, gConfig
 from DIRAC.Core.Utilities.Subprocess import systemCall
 from DIRAC.WorkloadManagementSystem.Utilities.RemoteRunner import RemoteRunner
 from DIRAC.Workflow.Modules.ModuleBase import ModuleBase
@@ -88,8 +88,14 @@ def _executeCommand(self):
         """execute the self.command (uses systemCall)"""
         failed = False
 
-        remoteRunner = RemoteRunner()
-        if remoteRunner.is_remote_execution():
+        # Check whether the execution should be done remotely
+        is_remote_execution = gConfig.getValue("/LocalSite/RemoteExecution", "false")
+        if is_remote_execution.lower() in ["true", "yes"]:
+            remoteRunner = RemoteRunner(
+                gConfig.getValue("/LocalSite/Site"),
+                gConfig.getValue("/LocalSite/GridCE"),
+                gConfig.getValue("/LocalSite/CEQueue"),
+            )
             retVal = remoteRunner.execute(self.command)
         else:
             retVal = systemCall(
diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py b/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py
@@ -1,62 +1,99 @@
 """ RemoteRunner
 
+RemoteRunner has been designed to send scripts/applications and input files on remote worker nodes having
+no outbound connectivity (e.g. supercomputers)
+
 Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
 the script/application execution on a remote machine.
-Depending on an environment variable WORKLOADEXECLOCATION, it decides whether it should take care of the execution.
-RemoteRunner has been designed to send script/application on remote worker nodes having no outbound connectivity
-(e.g. supercomputers)
 """
 import os
 import shlex
+from six.moves import shlex_quote
 import time
 
-from DIRAC import gLogger, gConfig, S_OK
+from DIRAC import gLogger, gConfig, S_OK, S_ERROR
 from DIRAC.Core.Security.ProxyInfo import getProxyInfo
+from DIRAC.Core.Utilities.Decorators import deprecated
 from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory
 from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getQueue
 from DIRAC.WorkloadManagementSystem.Client import PilotStatus
 
 
 class RemoteRunner(object):
-    def __init__(self):
+    def __init__(self, siteName=None, ceName=None, queueName=None):
         self.log = gLogger.getSubLogger("RemoteRunner")
-        self.remoteExecution = gConfig.getValue("/LocalSite/RemoteExecution", "false")
-
+        self._workloadSite = siteName
+        if not self._workloadSite:
+            self.log.warn("You are expected to provide a siteName in parameters from v8.0")
+            self.log.warn("Trying to get workloadSite from /LocalSite/Site...")
+            self._workloadSite = gConfig.getValue("/LocalSite/Site")
+        self._workloadCE = ceName
+        if not self._workloadCE:
+            self.log.warn("You are expected to provide a ceName in parameters from v8.0")
+            self.log.warn("Trying to get workloadSite from /LocalSite/GridCE...")
+            self._workloadCE = gConfig.getValue("/LocalSite/GridCE")
+        self._workloadQueue = queueName
+        if not self._workloadQueue:
+            self.log.warn("You are expected to provide a queueName in parameters from v8.0")
+            self.log.warn("Trying to get workloadSite from /LocalSite/CEQueue...")
+            self._workloadQueue = gConfig.getValue("/LocalSite/CEQueue")
+
+    @deprecated('Use gConfig.getValue("/LocalSite/RemoteExecution") instead.')
     def is_remote_execution(self):
         """Main method: decides whether the execution will be done locally or remotely via a CE.
 
+        This method does not really make sense: if we use RemoteRunner, it means we want to perform a remote execution.
+        Therefore, this should be checked before calling RemoteRunner by checking /LocalSite/RemoteExecution for instance.
+
         :return: bool
         """
+        return gConfig.getValue("/LocalSite/RemoteExecution")
 
-        # if remoteExecution is true, this means the workload should be executed
-        # in a different remote location. This mainly happens when the remote Site has no
-        # external connectivity and can only execute the workload itself.
-        return self.remoteExecution.lower() in ["true", "yes"]
-
-    def execute(self, command):
+    def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemoteJob=True):
         """Execute the command remotely via a CE
 
         :param str command: command to execute remotely
+        :param str workingDirectory: directory containing the inputs required by the command
+        :param int numberOfProcessors: number of processors to allocate to the command
+        :param bool cleanRemoteJob: clean the files related to the command on the remote host if True
+        :return: (status, output, error)
         """
+        self.log.verbose("Command to submit:", command)
+
+        # Check whether CE parameters are set
+        result = self._checkParameters()
+        if not result["OK"]:
+            result["Value"] = (-1, "", result["Message"])
+            return result
+        self.log.verbose(
+            "The command will be sent to",
+            "site %s, CE %s, queue %s" % (self._workloadSite, self._workloadCE, self._workloadQueue),
+        )
+
         # Set up Application Queue
-        self.log.verbose("Remote application execution on:", self.remoteExecution)
-        result = self._setUpworkloadCE()
+        result = self._setUpWorkloadCE(numberOfProcessors)
         if not result["OK"]:
+            result["Value"] = (-1, "", result["Message"])
             return result
         workloadCE = result["Value"]
+        self.log.debug("The CE interface has been set up")
 
         # Add the command in an executable file
-        executable = self._wrapCommand(command)
-        # get inputs file from the current working directory
-        inputs = os.listdir(".")
+        executable = "workloadExec.sh"
+        self._wrapCommand(command, workingDirectory, executable)
+        self.log.debug("The command has been wrapped into an executable")
+
+        # Get inputs from the working directory given as parameter
+        inputs = os.listdir(workingDirectory)
         inputs.remove(os.path.basename(executable))
         self.log.verbose("The executable will be sent along with the following inputs:", ",".join(inputs))
-        # request the whole directory as output
+        # Request the whole directory as output
         outputs = ["/"]
 
         # Submit the command as a job
         result = workloadCE.submitJob(executable, workloadCE.proxy, inputs=inputs, outputs=outputs)
         if not result["OK"]:
+            result["Value"] = (-1, "", result["Message"])
             return result
         jobID = result["Value"][0]
         stamp = result["PilotStampDict"][jobID]
@@ -67,46 +104,83 @@ def execute(self, command):
             time.sleep(120)
             result = workloadCE.getJobStatus([jobID])
             if not result["OK"]:
+                result["Value"] = (-1, "", result["Message"])
                 return result
             jobStatus = result["Value"][jobID]
         self.log.verbose("The final status of the application/script is: ", jobStatus)
 
         # Get job outputs
         result = workloadCE.getJobOutput("%s:::%s" % (jobID, stamp), os.path.abspath("."))
         if not result["OK"]:
+            result["Value"] = (-1, "", result["Message"])
             return result
+        output, error = result["Value"]
+
+        # Clean job on the remote resource
+        if cleanRemoteJob:
+            result = workloadCE.cleanJob(jobID)
+            if not result["OK"]:
+                result["Value"] = (-1, "", result["Message"])
+                return result
 
         commandStatus = {"Done": 0, "Failed": -1, "Killed": -2}
-        output, error = result["Value"]
-        outputDict = {"OK": True, "Value": [commandStatus[jobStatus], output, error]}
-        return outputDict
+        return S_OK((commandStatus[jobStatus], output, error))
+
+    def _checkParameters(self):
+        """Check that the site, CE and queue of the remote resource are defined.
+        :return: S_OK, S_ERROR
+        """
+        if not self._workloadSite:
+            return S_ERROR("The remote site is not defined")
+        if not self._workloadCE:
+            return S_ERROR("The remote CE is not defined")
+        if not self._workloadQueue:
+            return S_ERROR("The remote queue is not defined")
 
-    def _setUpworkloadCE(self):
+        return S_OK()
+
+    def _setUpWorkloadCE(self, numberOfProcessorsPayload=1):
         """Get application queue and configure it
 
         :return: a ComputingElement instance
         """
-        # Get CE parameters
-        workloadSite = gConfig.getValue("/LocalSite/Site")
-        workloadCE = gConfig.getValue("/LocalSite/GridCE")
-        workloadQueue = gConfig.getValue("/LocalSite/CEQueue")
-
-        result = getQueue(workloadSite, workloadCE, workloadQueue)
+        # Get CE Parameters
+        result = getQueue(self._workloadSite, self._workloadCE, self._workloadQueue)
         if not result["OK"]:
             return result
         ceType = result["Value"]["CEType"]
         ceParams = result["Value"]
 
         # Build CE
         ceFactory = ComputingElementFactory()
-        result = ceFactory.getCE(ceName=workloadCE, ceType=ceType, ceParametersDict=ceParams)
+        result = ceFactory.getCE(ceName=self._workloadCE, ceType=ceType, ceParametersDict=ceParams)
         if not result["OK"]:
             return result
         workloadCE = result["Value"]
 
+        # Set the number of processors available according to the need of the payload
+        numberOfProcessorsCE = workloadCE.ceParameters.get("NumberOfProcessors", 1)
+        if numberOfProcessorsCE < 1 or numberOfProcessorsPayload < 1:
+            self.log.warn(
+                "Inappropriate values:",
+                "number of processors required for the payload %s - for the CE %s"
+                % (numberOfProcessorsPayload, numberOfProcessorsCE),
+            )
+            return S_ERROR("Inappropriate NumberOfProcessors value")
+
+        if numberOfProcessorsPayload > numberOfProcessorsCE:
+            self.log.warn(
+                "Not enough processors to execute the payload: ",
+                "number of processors required for the payload %s < %s the WN capacity"
+                % (numberOfProcessorsPayload, numberOfProcessorsCE),
+            )
+            return S_ERROR("Not enough processors to execute the command")
+
+        workloadCE.ceParameters["NumberOfProcessors"] = numberOfProcessorsPayload
+
         # Add a proxy to the CE
         result = getProxyInfo()
-        if not result["OK"] and not result["Value"]["chain"]:
+        if not result["OK"]:
             return result
         proxy = result["Value"]["chain"]
         result = proxy.getRemainingSecs()
@@ -117,13 +191,36 @@ def _setUpworkloadCE(self):
 
         return S_OK(workloadCE)
 
-    def _wrapCommand(self, command):
+    def _wrapCommand(self, command, workingDirectory, executable):
         """Wrap the command in a file
 
         :param str command: command line to write in the executable
-        :return: name of the executable file
+        :param str workingDirectory: directory containing the inputs required by the command
+        :param str executable: path of the executable that should contain the command to submit
+        :return: path of the executable
         """
-        executable = "workloadExec.sh"
+        # Check whether the command contains any absolute path: there would be no way to access them remotely
+        # They need to be converted into relative path
+        argumentsProcessed = []
+        for argument in shlex.split(command):
+
+            argPath = os.path.dirname(argument)
+            # The argument does not contain any path, not concerned
+            if not argPath:
+                argumentsProcessed.append(argument)
+                continue
+
+            argPathAbsolutePath = os.path.abspath(argPath)
+            workingDirAbsolutePath = os.path.abspath(workingDirectory)
+            # The argument is not included in the workingDirectory, not concerned
+            if not argPathAbsolutePath.startswith(workingDirAbsolutePath):
+                argumentsProcessed.append(argument)
+                continue
+
+            # The argument is included in the workingDirectory and should be converted
+            argumentsProcessed.append(os.path.join(".", os.path.basename(argument)))
+
+        # From v8.0, use: shlex.join(argumentsProcessed)
+        command = " ".join(shlex_quote(arg) for arg in argumentsProcessed)
         with open(executable, "w") as f:
             f.write(command)
-        return executable
diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py b/src/DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py
@@ -0,0 +1,110 @@
+""" Test class for RemoteRunner
+"""
+
+# imports
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import pytest
+import os
+from diraccfg import CFG
+
+# DIRAC Components
+from DIRAC import gLogger, gConfig, S_OK
+from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData
+from DIRAC.Core.Security.X509Chain import X509Chain  # pylint: disable=import-error
+from DIRAC.WorkloadManagementSystem.Utilities.RemoteRunner import RemoteRunner
+
+gLogger.setLevel("DEBUG")
+
+
+@pytest.mark.parametrize(
+    "command, workingDirectory, expectedContent",
+    [
+        ("/path/to/script.sh", "/path/to", "./script.sh"),
+        ("/path/to/script.sh", "/another/path/to", "/path/to/script.sh"),
+        ("/path/to/script.sh arg1", "/path/to", "./script.sh arg1"),
+        ("/path/to/script.sh /path/to/arg1", "/path/to", "./script.sh ./arg1"),
+        ("/path/to/script.sh /anotherpath/to/arg1", "/path/to", "./script.sh /anotherpath/to/arg1"),
+        ("/path/to/script.sh /another/path/to/arg1", "/path/to", "./script.sh /another/path/to/arg1"),
+        ("./script.sh", ".", "./script.sh"),
+        ("ls", "/path/to", "ls"),
+        ("echo 'Hello World'", "/path/to", "echo 'Hello World'"),
+        (
+            "lb-prod-run prodConf_Gauss_12345_12345.json --verbose",
+            ".",
+            "lb-prod-run prodConf_Gauss_12345_12345.json --verbose",
+        ),
+    ],
+)
+def test__wrapCommand(command, workingDirectory, expectedContent):
+    """Test RemoteRunner()._wrapCommand()"""
+    executable = "workloadExec.sh"
+
+    # Instantiate a RemoteRunner and wrap the command
+    remoteRunner = RemoteRunner("Site1", "CE1", "queue1")
+    remoteRunner._wrapCommand(command, workingDirectory, executable)
+
+    # Test the results
+    assert os.path.isfile(executable)
+    with open(executable, "r") as f:
+        content = f.read()
+    os.remove(executable)
+    assert content == expectedContent
+
+
+@pytest.mark.parametrize(
+    "payloadNumberOfProcessors, ceNumberOfProcessors, expectedResult, expectedNumberOfProcessors",
+    [
+        # CE has at least as many processors as the payload requests
+        (1, 1, True, 1),
+        (2, 2, True, 2),
+        (1, 2, True, 1),
+        # CE has fewer processors than the payload requests
+        (2, 1, False, "Not enough processors to execute the command"),
+        # Specific cases: values lower than 1 (0 or negative) are not accepted
+        (0, 1, False, "Inappropriate NumberOfProcessors value"),
+        (1, 0, False, "Inappropriate NumberOfProcessors value"),
+        (-4, 1, False, "Inappropriate NumberOfProcessors value"),
+        (1, -4, False, "Inappropriate NumberOfProcessors value"),
+        (0, 0, False, "Inappropriate NumberOfProcessors value"),
+    ],
+)
+def test__setUpWorkloadCE(
+    mocker, payloadNumberOfProcessors, ceNumberOfProcessors, expectedResult, expectedNumberOfProcessors
+):
+    """Test RemoteRunner()._setUpWorkloadCE()"""
+    mocker.patch(
+        "DIRAC.WorkloadManagementSystem.Utilities.RemoteRunner.getProxyInfo", return_value=S_OK({"chain": X509Chain()})
+    )
+    mocker.patch("DIRAC.Core.Security.X509Chain.X509Chain.getRemainingSecs", return_value=S_OK(1000))
+
+    # Configure the CS with the number of available processors in the CE
+    siteName = "DIRAC.Site1.site"
+    ceName = "CE1"
+    queueName = "queue1"
+
+    config = {"Resources": {"Sites": {"DIRAC": {siteName: {"CEs": {ceName: {}}}}}}}
+    ceConfig = config["Resources"]["Sites"]["DIRAC"][siteName]["CEs"][ceName]
+    ceConfig["CEType"] = "HTCondorCE"
+    ceConfig["Queues"] = {}
+    ceConfig["Queues"][queueName] = {}
+    ceConfig["Queues"][queueName]["NumberOfProcessors"] = ceNumberOfProcessors
+
+    # Load the configuration
+    gConfigurationData.localCFG = CFG()
+    cfg = CFG()
+    cfg.loadFromDict(config)
+    gConfig.loadCFG(cfg)
+
+    # Instantiate a RemoteRunner and set up the CE
+    remoteRunner = RemoteRunner(siteName, ceName, queueName)
+    result = remoteRunner._setUpWorkloadCE(payloadNumberOfProcessors)
+
+    # Test the results
+    assert result["OK"] == expectedResult
+    if expectedResult:
+        workloadCE = result["Value"]
+        assert workloadCE.ceParameters["NumberOfProcessors"] == expectedNumberOfProcessors
+    else:
+        assert result["Message"] == expectedNumberOfProcessors