6
6
Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
7
7
the script/application execution on a remote machine.
8
8
"""
9
+ import hashlib
9
10
import os
10
11
import shlex
11
12
import time
22
23
class RemoteRunner :
23
24
def __init__ (self , siteName = None , ceName = None , queueName = None ):
24
25
self .log = gLogger .getSubLogger ("RemoteRunner" )
26
+ self .executable = "workloadExec.sh"
27
+ self .checkSumOutput = "md5Checksum.txt"
28
+
25
29
self ._workloadSite = siteName
26
30
if not self ._workloadSite :
27
31
self .log .warn ("You are expected to provide a siteName in parameters from v8.0" )
@@ -61,44 +65,44 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
61
65
self .log .verbose ("Command to submit:" , command )
62
66
63
67
# Check whether CE parameters are set
64
- result = self ._checkParameters ()
65
- if not result ["OK" ]:
68
+ if not (result := self ._checkParameters ())["OK" ]:
66
69
result ["Errno" ] = DErrno .ESECTION
67
70
return result
68
- self .log .verbose (
69
- "The command will be sent to" ,
71
+ self .log .info (
72
+ "Preparing and submitting the command to" ,
70
73
f"site { self ._workloadSite } , CE { self ._workloadCE } , queue { self ._workloadQueue } " ,
71
74
)
72
75
73
76
# Set up Application Queue
74
- result = self ._setUpWorkloadCE (numberOfProcessors )
75
- if not result ["OK" ]:
77
+ if not (result := self ._setUpWorkloadCE (numberOfProcessors ))["OK" ]:
76
78
result ["Errno" ] = DErrno .ERESUNA
77
79
return result
78
80
workloadCE = result ["Value" ]
79
81
self .log .debug ("The CE interface has been set up" )
80
82
81
83
# Add the command in an executable file
82
- executable = "workloadExec.sh"
83
- self ._wrapCommand (command , workingDirectory , executable )
84
+ self ._wrapCommand (command , workingDirectory )
84
85
self .log .debug ("The command has been wrapped into an executable" )
85
86
86
87
# Get inputs from the current working directory
87
88
inputs = os .listdir (workingDirectory )
88
- inputs .remove (os .path .basename (executable ))
89
+ inputs .remove (os .path .basename (self . executable ))
89
90
self .log .verbose ("The executable will be sent along with the following inputs:" , "," .join (inputs ))
90
91
# Request the whole directory as output
91
92
outputs = ["/" ]
92
93
93
94
# Submit the command as a job
94
- result = workloadCE .submitJob (executable , workloadCE .proxy , inputs = inputs , outputs = outputs )
95
- if not result ["OK" ]:
95
+ if not (result := workloadCE .submitJob (self .executable , workloadCE .proxy , inputs = inputs , outputs = outputs ))[
96
+ "OK"
97
+ ]:
96
98
result ["Errno" ] = DErrno .EWMSSUBM
97
99
return result
98
100
jobID = result ["Value" ][0 ]
99
101
stamp = result ["PilotStampDict" ][jobID ]
102
+ self .log .info ("The command has been wrapped in a job and sent. Remote JobID: " , jobID )
100
103
101
104
# Get status of the job
105
+ self .log .info ("Waiting for the end of the job..." )
102
106
jobStatus = PilotStatus .RUNNING
103
107
while jobStatus not in PilotStatus .PILOT_FINAL_STATES :
104
108
time .sleep (120 )
@@ -107,20 +111,27 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
107
111
result ["Errno" ] = DErrno .EWMSSTATUS
108
112
return result
109
113
jobStatus = result ["Value" ][jobID ]
110
- self .log .verbose ("The final status of the application/script is: " , jobStatus )
114
+ self .log .info ("The final status of the application/script is: " , jobStatus )
111
115
112
116
# Get job outputs
113
- result = workloadCE . getJobOutput ( f" { jobID } ::: { stamp } " , os . path . abspath ( "." ) )
114
- if not result ["OK" ]:
117
+ self . log . info ( "Getting the outputs of the command..." )
118
+ if not ( result := workloadCE . getJobOutput ( f" { jobID } ::: { stamp } " , os . path . abspath ( "." ))) ["OK" ]:
115
119
result ["Errno" ] = DErrno .EWMSJMAN
116
120
return result
117
121
output , error = result ["Value" ]
118
122
123
+ # Make sure the output is correct
124
+ self .log .info ("Checking the integrity of the outputs..." )
125
+ if not (result := self ._checkOutputIntegrity ("." ))["OK" ]:
126
+ result ["Errno" ] = DErrno .EWMSJMAN
127
+ return result
128
+ self .log .info ("The output has been retrieved and declared complete" )
129
+
119
130
# Clean job in the remote resource
120
131
if cleanRemoteJob :
121
- result = workloadCE .cleanJob (jobID )
122
- if not result ["OK" ]:
132
+ if not (result := workloadCE .cleanJob (jobID ))["OK" ]:
123
133
self .log .warn ("Failed to clean the output remotely" , result ["Message" ])
134
+ self .log .info ("The job has been remotely removed" )
124
135
125
136
commandStatus = {"Done" : 0 , "Failed" : - 1 , "Killed" : - 2 }
126
137
return S_OK ((commandStatus [jobStatus ], output , error ))
@@ -190,12 +201,11 @@ def _setUpWorkloadCE(self, numberOfProcessorsPayload=1):
190
201
191
202
return S_OK (workloadCE )
192
203
193
- def _wrapCommand (self , command , workingDirectory , executable ):
204
+ def _wrapCommand (self , command , workingDirectory ):
194
205
"""Wrap the command in a file
195
206
196
207
:param str command: command line to write in the executable
197
208
:param str workingDirectory: directory containing the inputs required by the command
198
- :param str executable: path of the executable that should contain the command to submit
199
209
:return: path of the executable
200
210
"""
201
211
# Check whether the command contains any absolute path: there would be no way to access them remotely
@@ -219,5 +229,34 @@ def _wrapCommand(self, command, workingDirectory, executable):
219
229
argumentsProcessed .append (os .path .join ("." , os .path .basename (argument )))
220
230
221
231
command = shlex .join (argumentsProcessed )
222
- with open (executable , "w" ) as f :
232
+ with open (self . executable , "w" ) as f :
223
233
f .write (command )
234
+ # Post-processing: compute the checksum of the outputs
235
+ f .write (f"\n md5sum * > { self .checkSumOutput } " )
236
+
237
def _checkOutputIntegrity(self, workingDirectory):
    """Make sure that output files are not corrupted.

    Reads the checksum listing named ``self.checkSumOutput`` located in ``workingDirectory``,
    recomputes the md5 digest of every file it lists and compares it with the recorded value.

    :param str workingDirectory: path of the outputs
    :return: S_OK() if every listed file is present and matches its recorded checksum,
             S_ERROR if the listing is missing or malformed, or if a file is missing or differs
    """
    checkSumOutput = os.path.join(workingDirectory, self.checkSumOutput)
    if not os.path.exists(checkSumOutput):
        return S_ERROR(f"Cannot guarantee the integrity of the outputs: {checkSumOutput} unavailable")

    with open(checkSumOutput) as checkSumFile:
        # for each output file, compute the md5 checksum
        for line in checkSumFile:
            entry = line.rstrip("\n")
            if not entry.strip():
                # Tolerate blank lines instead of failing on tuple unpacking
                continue

            # Expected layout: "<digest> <mode><name>", where <mode> is " " (text) or "*" (binary).
            # Splitting on the first space only keeps file names containing spaces intact.
            # NOTE(review): backslash-escaped entries (names with "\\" or newlines) are not unescaped.
            checkSum, separator, remoteOutput = entry.partition(" ")
            if remoteOutput[:1] in (" ", "*"):
                remoteOutput = remoteOutput[1:]
            if not separator or not remoteOutput:
                return S_ERROR(f"Cannot guarantee the integrity of the outputs: malformed entry in {checkSumOutput}")

            # The listing cannot vouch for itself: it is still being written when it is hashed
            if remoteOutput == self.checkSumOutput:
                continue

            localOutput = os.path.join(workingDirectory, remoteOutput)
            if not os.path.exists(localOutput):
                return S_ERROR(f"{localOutput} was expected but not found")

            # Hash the file chunk by chunk so that large outputs are never fully loaded in memory
            digest = hashlib.md5()
            with open(localOutput, "rb") as outputFile:
                while chunk := outputFile.read(128 * digest.block_size):
                    digest.update(chunk)
            if checkSum != digest.hexdigest():
                return S_ERROR(f"{localOutput} is corrupted")

    return S_OK()
0 commit comments