1
1
""" RemoteRunner
2
2
3
+ RemoteRunner has been designed to send scripts/applications and input files on remote worker nodes having
4
+ no outbound connectivity (e.g. supercomputers)
5
+
3
6
Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
4
7
the script/application execution on a remote machine.
5
- Depending on an environment variable WORKLOADEXECLOCATION, it decides whether it should take care of the execution.
6
- RemoteRunner has been designed to send script/application on remote worker nodes having no outbound connectivity
7
- (e.g. supercomputers)
8
8
"""
9
9
import os
10
10
import shlex
11
+ from six .moves import shlex_quote
11
12
import time
12
13
13
- from DIRAC import gLogger , gConfig , S_OK
14
+ from DIRAC import gLogger , gConfig , S_OK , S_ERROR
14
15
from DIRAC .Core .Security .ProxyInfo import getProxyInfo
16
+ from DIRAC .Core .Utilities .Decorators import deprecated
15
17
from DIRAC .Resources .Computing .ComputingElementFactory import ComputingElementFactory
16
18
from DIRAC .ConfigurationSystem .Client .Helpers .Resources import getQueue
17
19
from DIRAC .WorkloadManagementSystem .Client import PilotStatus
18
20
19
21
20
22
class RemoteRunner (object ):
21
def __init__(self, siteName=None, ceName=None, queueName=None):
    """Store the site, CE and queue that will receive the workload.

    Any value that is not provided is read from the local configuration
    (/LocalSite/Site, /LocalSite/GridCE, /LocalSite/CEQueue) after a warning is logged.

    :param str siteName: name of the site hosting the CE
    :param str ceName: name of the CE to submit the workload to
    :param str queueName: name of the queue to use on the CE
    """
    self.log = gLogger.getSubLogger("RemoteRunner")

    self._workloadSite = self._getParameterOrConfig(siteName, "siteName", "workloadSite", "/LocalSite/Site")
    self._workloadCE = self._getParameterOrConfig(ceName, "ceName", "workloadCE", "/LocalSite/GridCE")
    self._workloadQueue = self._getParameterOrConfig(
        queueName, "queueName", "workloadQueue", "/LocalSite/CEQueue"
    )

def _getParameterOrConfig(self, value, parameterName, attributeName, configPath):
    """Return value if it is set, else warn and read the fallback from the configuration.

    :param value: value passed by the caller (may be None or empty)
    :param str parameterName: name of the constructor parameter, used in the warning
    :param str attributeName: name of what is being resolved, used in the warning
    :param str configPath: configuration option read when value is not set
    :return: value, or the result of gConfig.getValue(configPath)
    """
    if value:
        return value

    self.log.warn("You are expected to provide a %s in parameters from v8.0" % parameterName)
    # Name the element actually being looked up (the original messages all said "workloadSite")
    self.log.warn("Trying to get %s from %s..." % (attributeName, configPath))
    return gConfig.getValue(configPath)
40
+
41
@deprecated('Use gConfig.getValue("/LocalSite/RemoteExecution") instead.')
def is_remote_execution(self):
    """Tell whether the execution should be done remotely via a CE.

    This method does not really make sense: if we use RemoteRunner, it means we want to perform
    a remote execution. Therefore, this should be checked before calling RemoteRunner, by
    checking /LocalSite/RemoteExecution for instance.

    :return: bool, True when /LocalSite/RemoteExecution is "true" or "yes" (case-insensitive)
    """
    # Returning the option as read would break the documented bool contract:
    # presumably it comes back as a string, and a string such as "false" is truthy.
    # Convert it explicitly, as the previous implementation did.
    remoteExecution = gConfig.getValue("/LocalSite/RemoteExecution", "false")
    return str(remoteExecution).lower() in ("true", "yes")
35
-
36
- def execute (self , command ):
52
+ def execute (self , command , workingDirectory = "." , numberOfProcessors = 1 , cleanRemoteJob = True ):
37
53
"""Execute the command remotely via a CE
38
54
39
55
:param str command: command to execute remotely
56
+ :param str workingDirectory: directory containing the inputs required by the command
57
+ :param int numberOfProcessors: number of processors to allocate to the command
58
+ :param str cleanRemoteJob: clean the files related to the command on the remote host if True
59
+ :return: (status, output, error)
40
60
"""
61
+ self .log .verbose ("Command to submit:" , command )
62
+
63
+ # Check whether CE parameters are set
64
+ result = self ._checkParameters ()
65
+ if not result ["OK" ]:
66
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
67
+ return result
68
+ self .log .verbose (
69
+ "The command will be sent to" ,
70
+ "site %s, CE %s, queue %s" % (self ._workloadSite , self ._workloadCE , self ._workloadQueue ),
71
+ )
72
+
41
73
# Set up Application Queue
42
- self .log .verbose ("Remote application execution on:" , self .remoteExecution )
43
- result = self ._setUpworkloadCE ()
74
+ result = self ._setUpWorkloadCE (numberOfProcessors )
44
75
if not result ["OK" ]:
76
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
45
77
return result
46
78
workloadCE = result ["Value" ]
79
+ self .log .debug ("The CE interface has been set up" )
47
80
48
81
# Add the command in an executable file
49
- executable = self ._wrapCommand (command )
50
- # get inputs file from the current working directory
51
- inputs = os .listdir ("." )
82
+ executable = "workloadExec.sh"
83
+ self ._wrapCommand (command , workingDirectory , executable )
84
+ self .log .debug ("The command has been wrapped into an executable" )
85
+
86
+ # Get inputs from the current working directory
87
+ inputs = os .listdir (workingDirectory )
52
88
inputs .remove (os .path .basename (executable ))
53
89
self .log .verbose ("The executable will be sent along with the following inputs:" , "," .join (inputs ))
54
- # request the whole directory as output
90
+ # Request the whole directory as output
55
91
outputs = ["/" ]
56
92
57
93
# Submit the command as a job
58
94
result = workloadCE .submitJob (executable , workloadCE .proxy , inputs = inputs , outputs = outputs )
59
95
if not result ["OK" ]:
96
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
60
97
return result
61
98
jobID = result ["Value" ][0 ]
62
99
stamp = result ["PilotStampDict" ][jobID ]
@@ -67,46 +104,83 @@ def execute(self, command):
67
104
time .sleep (120 )
68
105
result = workloadCE .getJobStatus ([jobID ])
69
106
if not result ["OK" ]:
107
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
70
108
return result
71
109
jobStatus = result ["Value" ][jobID ]
72
110
self .log .verbose ("The final status of the application/script is: " , jobStatus )
73
111
74
112
# Get job outputs
75
113
result = workloadCE .getJobOutput ("%s:::%s" % (jobID , stamp ), os .path .abspath ("." ))
76
114
if not result ["OK" ]:
115
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
77
116
return result
117
+ output , error = result ["Value" ]
118
+
119
+ # Clean job on the remote resource
120
+ if cleanRemoteJob :
121
+ result = workloadCE .cleanJob (jobID )
122
+ if not result ["OK" ]:
123
+ result ["Value" ] = (- 1 , "" , result ["Message" ])
124
+ return result
78
125
79
126
commandStatus = {"Done" : 0 , "Failed" : - 1 , "Killed" : - 2 }
80
- output , error = result ["Value" ]
81
- outputDict = {"OK" : True , "Value" : [commandStatus [jobStatus ], output , error ]}
82
- return outputDict
127
+ return S_OK ((commandStatus [jobStatus ], output , error ))
128
+
129
def _checkParameters(self):
    """Verify that the site, the CE and the queue targeted by the runner are all defined.

    The values are checked in that order and the first missing one is reported.

    :return: S_OK() if the three values are set, S_ERROR with an explicit message otherwise
    """
    requiredAttributes = (
        ("_workloadSite", "The remote site is not defined"),
        ("_workloadCE", "The remote CE is not defined"),
        ("_workloadQueue", "The remote queue is not defined"),
    )
    for attributeName, errorMessage in requiredAttributes:
        # Attributes are read one at a time so the checks stay short-circuited
        if not getattr(self, attributeName):
            return S_ERROR(errorMessage)

    return S_OK()
141
+
142
+ def _setUpWorkloadCE (self , numberOfProcessorsPayload = 1 ):
85
143
"""Get application queue and configure it
86
144
87
145
:return: a ComputingElement instance
88
146
"""
89
- # Get CE parameters
90
- workloadSite = gConfig .getValue ("/LocalSite/Site" )
91
- workloadCE = gConfig .getValue ("/LocalSite/GridCE" )
92
- workloadQueue = gConfig .getValue ("/LocalSite/CEQueue" )
93
-
94
- result = getQueue (workloadSite , workloadCE , workloadQueue )
147
+ # Get CE Parameters
148
+ result = getQueue (self ._workloadSite , self ._workloadCE , self ._workloadQueue )
95
149
if not result ["OK" ]:
96
150
return result
97
151
ceType = result ["Value" ]["CEType" ]
98
152
ceParams = result ["Value" ]
99
153
100
154
# Build CE
101
155
ceFactory = ComputingElementFactory ()
102
- result = ceFactory .getCE (ceName = workloadCE , ceType = ceType , ceParametersDict = ceParams )
156
+ result = ceFactory .getCE (ceName = self . _workloadCE , ceType = ceType , ceParametersDict = ceParams )
103
157
if not result ["OK" ]:
104
158
return result
105
159
workloadCE = result ["Value" ]
106
160
161
+ # Set the number of processors available according to the need of the payload
162
+ numberOfProcessorsCE = workloadCE .ceParameters .get ("NumberOfProcessors" , 1 )
163
+ if numberOfProcessorsCE < 1 or numberOfProcessorsPayload < 1 :
164
+ self .log .warn (
165
+ "Inappropriate values:" ,
166
+ "number of processors required for the payload %s - for the CE %s"
167
+ % (numberOfProcessorsPayload , numberOfProcessorsCE ),
168
+ )
169
+ return S_ERROR ("Inappropriate NumberOfProcessors value" )
170
+
171
+ if numberOfProcessorsPayload > numberOfProcessorsCE :
172
+ self .log .warn (
173
+ "Not enough processors to execute the payload: " ,
174
+ "number of processors required for the payload %s < %s the WN capacity"
175
+ % (numberOfProcessorsPayload , numberOfProcessorsCE ),
176
+ )
177
+ return S_ERROR ("Not enough processors to execute the command" )
178
+
179
+ workloadCE .ceParameters ["NumberOfProcessors" ] = numberOfProcessorsPayload
180
+
107
181
# Add a proxy to the CE
108
182
result = getProxyInfo ()
109
- if not result ["OK" ] and not result [ "Value" ][ "chain" ] :
183
+ if not result ["OK" ]:
110
184
return result
111
185
proxy = result ["Value" ]["chain" ]
112
186
result = proxy .getRemainingSecs ()
@@ -117,13 +191,36 @@ def _setUpworkloadCE(self):
117
191
118
192
return S_OK (workloadCE )
119
193
120
- def _wrapCommand (self , command ):
194
+ def _wrapCommand (self , command , workingDirectory , executable ):
121
195
"""Wrap the command in a file
122
196
123
197
:param str command: command line to write in the executable
124
- :return: name of the executable file
198
+ :param str workingDirectory: directory containing the inputs required by the command
199
+ :param str executable: path of the executable that should contain the command to submit
200
+ :return: path of the executable
125
201
"""
126
- executable = "workloadExec.sh"
202
+ # Check whether the command contains any absolute path: there would be no way to access them remotely
203
+ # They need to be converted into relative path
204
+ argumentsProcessed = []
205
+ for argument in shlex .split (command ):
206
+
207
+ argPath = os .path .dirname (argument )
208
+ # The argument does not contain any path, not concerned
209
+ if not argPath :
210
+ argumentsProcessed .append (argument )
211
+ continue
212
+
213
+ argPathAbsolutePath = os .path .abspath (argPath )
214
+ workingDirAbsolutePath = os .path .abspath (workingDirectory )
215
+ # The argument is not included in the workingDirectory, not concerned
216
+ if not argPathAbsolutePath .startswith (workingDirAbsolutePath ):
217
+ argumentsProcessed .append (argument )
218
+ continue
219
+
220
+ # The argument is included in the workingDirectory and should be converted
221
+ argumentsProcessed .append (os .path .join ("." , os .path .basename (argument )))
222
+
223
+ # Fro v8.0, use: shlex.join(argumentsProcessed)
224
+ command = " " .join (shlex_quote (arg ) for arg in argumentsProcessed )
127
225
with open (executable , "w" ) as f :
128
226
f .write (command )
129
- return executable
0 commit comments