6
6
Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
7
7
the script/application execution on a remote machine.
8
8
"""
9
+ import hashlib
9
10
import os
10
11
import shlex
11
12
import time
22
23
class RemoteRunner :
23
24
def __init__ (self , siteName = None , ceName = None , queueName = None ):
24
25
self .log = gLogger .getSubLogger ("RemoteRunner" )
26
+ self .executable = "workloadExec.sh"
27
+ self .checkSumOutput = "md5Checksum.txt"
28
+
25
29
self ._workloadSite = siteName
26
30
if not self ._workloadSite :
27
31
self .log .warn ("You are expected to provide a siteName in parameters from v8.0" )
@@ -65,8 +69,8 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
65
69
if not result ["OK" ]:
66
70
result ["Errno" ] = DErrno .ESECTION
67
71
return result
68
- self .log .verbose (
69
- "The command will be sent to" ,
72
+ self .log .info (
73
+ "Preparing and submitting the command to" ,
70
74
f"site { self ._workloadSite } , CE { self ._workloadCE } , queue { self ._workloadQueue } " ,
71
75
)
72
76
@@ -79,26 +83,27 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
79
83
self .log .debug ("The CE interface has been set up" )
80
84
81
85
# Add the command in an executable file
82
- executable = "workloadExec.sh"
83
- self ._wrapCommand (command , workingDirectory , executable )
86
+ self ._wrapCommand (command , workingDirectory )
84
87
self .log .debug ("The command has been wrapped into an executable" )
85
88
86
89
# Get inputs from the current working directory
87
90
inputs = os .listdir (workingDirectory )
88
- inputs .remove (os .path .basename (executable ))
91
+ inputs .remove (os .path .basename (self . executable ))
89
92
self .log .verbose ("The executable will be sent along with the following inputs:" , "," .join (inputs ))
90
93
# Request the whole directory as output
91
94
outputs = ["/" ]
92
95
93
96
# Submit the command as a job
94
- result = workloadCE .submitJob (executable , workloadCE .proxy , inputs = inputs , outputs = outputs )
97
+ result = workloadCE .submitJob (self . executable , workloadCE .proxy , inputs = inputs , outputs = outputs )
95
98
if not result ["OK" ]:
96
99
result ["Errno" ] = DErrno .EWMSSUBM
97
100
return result
98
101
jobID = result ["Value" ][0 ]
99
102
stamp = result ["PilotStampDict" ][jobID ]
103
+ self .log .info ("The command has been wrapped in a job and sent. Remote JobID: " , jobID )
100
104
101
105
# Get status of the job
106
+ self .log .info ("Waiting for the end of the job..." )
102
107
jobStatus = PilotStatus .RUNNING
103
108
while jobStatus not in PilotStatus .PILOT_FINAL_STATES :
104
109
time .sleep (120 )
@@ -107,20 +112,30 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo
107
112
result ["Errno" ] = DErrno .EWMSSTATUS
108
113
return result
109
114
jobStatus = result ["Value" ][jobID ]
110
- self .log .verbose ("The final status of the application/script is: " , jobStatus )
115
+ self .log .info ("The final status of the application/script is: " , jobStatus )
111
116
112
117
# Get job outputs
118
+ self .log .info ("Getting the outputs of the command..." )
113
119
result = workloadCE .getJobOutput (f"{ jobID } :::{ stamp } " , os .path .abspath ("." ))
114
120
if not result ["OK" ]:
115
121
result ["Errno" ] = DErrno .EWMSJMAN
116
122
return result
117
123
output , error = result ["Value" ]
118
124
125
+ # Make sure the output is correct
126
+ self .log .info ("Checking the integrity of the outputs..." )
127
+ result = self ._checkOutputIntegrity ("." )
128
+ if not result ["OK" ]:
129
+ result ["Errno" ] = DErrno .EWMSJMAN
130
+ return result
131
+ self .log .info ("The output has been retrieved and declared complete" )
132
+
119
133
# Clean job in the remote resource
120
134
if cleanRemoteJob :
121
135
result = workloadCE .cleanJob (jobID )
122
136
if not result ["OK" ]:
123
137
self .log .warn ("Failed to clean the output remotely" , result ["Message" ])
138
+ self .log .info ("The job has been remotely removed" )
124
139
125
140
commandStatus = {"Done" : 0 , "Failed" : - 1 , "Killed" : - 2 }
126
141
return S_OK ((commandStatus [jobStatus ], output , error ))
@@ -190,12 +205,11 @@ def _setUpWorkloadCE(self, numberOfProcessorsPayload=1):
190
205
191
206
return S_OK (workloadCE )
192
207
193
- def _wrapCommand (self , command , workingDirectory , executable ):
208
+ def _wrapCommand (self , command , workingDirectory ):
194
209
"""Wrap the command in a file
195
210
196
211
:param str command: command line to write in the executable
197
212
:param str workingDirectory: directory containing the inputs required by the command
198
- :param str executable: path of the executable that should contain the command to submit
199
213
:return: path of the executable
200
214
"""
201
215
# Check whether the command contains any absolute path: there would be no way to access them remotely
@@ -219,5 +233,35 @@ def _wrapCommand(self, command, workingDirectory, executable):
219
233
argumentsProcessed .append (os .path .join ("." , os .path .basename (argument )))
220
234
221
235
command = shlex .join (argumentsProcessed )
222
- with open (executable , "w" ) as f :
236
+ with open (self . executable , "w" ) as f :
223
237
f .write (command )
238
+ # Post-processing: compute the checksum of the outputs
239
+ f .write (f"\n md5sum * > { self .checkSumOutput } " )
240
+
241
+ def _checkOutputIntegrity (self , workingDirectory ):
242
+ """Make sure that output files are not corrupted.
243
+
244
+ :param str workingDirectory: path of the outputs
245
+ """
246
+ checkSumOutput = os .path .join (workingDirectory , self .checkSumOutput )
247
+ if not os .path .exists (checkSumOutput ):
248
+ return S_ERROR (f"Cannot guarantee the integrity of the outputs: { checkSumOutput } unavailable" )
249
+
250
+ with open (checkSumOutput ) as f :
251
+ # for each output file, compute the md5 checksum
252
+ for line in f :
253
+ checkSum , remoteOutput = list (filter (None , line .strip ("\n " ).split (" " )))
254
+
255
+ hash = hashlib .md5 ()
256
+ localOutput = os .path .join (workingDirectory , remoteOutput )
257
+ if not os .path .exists (localOutput ):
258
+ return S_ERROR (f"{ localOutput } was expected but not found" )
259
+
260
+ with open (localOutput , "rb" ) as f :
261
+ while chunk := f .read (128 * hash .block_size ):
262
+ hash .update (chunk )
263
+ if checkSum != hash .hexdigest ():
264
+ print (hash .hexdigest ())
265
+ return S_ERROR (f"{ localOutput } is corrupted" )
266
+
267
+ return S_OK ()
0 commit comments