Commit b65cb3d

Merge pull request #6963 from aldbr/rel-v7r3_FIX_RemoteRunner
[v7r3] PushJobAgent: fixes and features
2 parents (784f8d8 + 06592b5), commit b65cb3d

9 files changed: +397 additions, -134 deletions

docs/source/AdministratorGuide/Resources/supercomputers.rst

Lines changed: 11 additions & 1 deletion
@@ -133,14 +133,24 @@ One has also to authorize the machine hosting the :mod:`~DIRAC.WorkloadManagemen
     Properties += GenericPilot
     Properties += FileCatalogManagement

-One has to specify the concerned VO in the targeted CEs, such as::
+One has to specify the concerned VO, the platform and the CPU Power in the targeted CEs, such as::

   <CE>
   {
     # To match a <VO> job
     VO = <VO>
     # Required because we are on a host (not on a worker node)
     VirtualOrganization = <VO>
+    # To match compatible jobs
+    Platform = <platform>
+    Queues
+    {
+      <Queue>
+      {
+        CPUNormalizationFactor = <CPU Power value>
+      }
+    }
+
   }

 Finally, one has to make sure that job scheduling parameters are correctly fine-tuned. Further details in the :ref:`JobScheduling section <jobscheduling>`.
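
For a concrete picture, here is a minimal sketch (not part of this commit) of how such CE options can be read back through the DIRAC configuration client; the site, CE and queue names below are placeholders:

    # Minimal sketch: read back the CE options documented above.
    # Site, CE and queue names are placeholders, not values from this commit.
    from DIRAC import gConfig

    ceBase = "/Resources/Sites/LCG/LCG.Example.org/CEs/ce.example.org"

    platform = gConfig.getValue(ceBase + "/Platform", "")
    cpuPower = gConfig.getValue(ceBase + "/Queues/long/CPUNormalizationFactor", 0.0)

    print("Platform:", platform)   # e.g. x86_64-centos7
    print("CPU power:", cpuPower)  # e.g. 10.0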

src/DIRAC/Resources/Computing/ARC6ComputingElement.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ def submitJob(self, executableFile, proxy, numberOfJobs=1):
         jobdescs = arc.JobDescriptionList()

         # Get the job into the ARC way
-        xrslString, diracStamp = self._writeXRSL(executableFile)
+        xrslString, diracStamp = self._writeXRSL(executableFile, [], [])
         self.log.debug("XRSL string submitted : %s" % xrslString)
         self.log.debug("DIRAC stamp for job : %s" % diracStamp)

src/DIRAC/Resources/Computing/ARCComputingElement.py

Lines changed: 31 additions & 33 deletions
@@ -167,13 +167,12 @@ def _addCEConfigDefaults(self):
         ComputingElement._addCEConfigDefaults(self)

     #############################################################################
-    def _writeXRSL(self, executableFile, inputs=None, outputs=None, executables=None):
+    def _writeXRSL(self, executableFile, inputs, outputs):
         """Create the JDL for submission

         :param str executableFile: executable to wrap in a XRSL file
-        :param str/list inputs: path of the dependencies to include along with the executable
-        :param str/list outputs: path of the outputs that we want to get at the end of the execution
-        :param str/list executables: path to inputs that should have execution mode on the remote worker node
+        :param list inputs: path of the dependencies to include along with the executable
+        :param list outputs: path of the outputs that we want to get at the end of the execution
         """
         diracStamp = makeGuid()[:8]
         # Evaluate the number of processors to allocate
@@ -191,34 +190,25 @@ def _writeXRSL(self, executableFile, inputs=None, outputs=None, executables=None
             "xrslMPExtraString": self.xrslMPExtraString,
         }

-        # Files that would need execution rights on the remote worker node
-        xrslExecutables = ""
-        if executables:
-            if not isinstance(executables, list):
-                executables = [executables]
-            xrslExecutables = "(executables=%s)" % " ".join(map(os.path.basename, executables))
-            # Add them to the inputFiles
-            if not inputs:
-                inputs = []
-            if not isinstance(inputs, list):
-                inputs = [inputs]
-            inputs += executables
-
         # Dependencies that have to be embedded along with the executable
         xrslInputs = ""
-        if inputs:
-            if not isinstance(inputs, list):
-                inputs = [inputs]
-            for inputFile in inputs:
-                xrslInputs += '(%s "%s")' % (os.path.basename(inputFile), inputFile)
+        executables = []
+        for inputFile in inputs:
+            inputFileBaseName = os.path.basename(inputFile)
+            if os.access(inputFile, os.X_OK):
+                # Files that would need execution rights on the remote worker node
+                executables.append(inputFileBaseName)
+            xrslInputs += '(%s "%s")' % (inputFileBaseName, inputFile)
+
+        # Executables are added to the XRSL
+        xrslExecutables = ""
+        if executables:
+            xrslExecutables = "(executables=%s)" % " ".join(executables)

         # Output files to retrieve once the execution is complete
         xrslOutputs = '("%s.out" "") ("%s.err" "")' % (diracStamp, diracStamp)
-        if outputs:
-            if not isinstance(outputs, list):
-                outputs = [outputs]
-            for outputFile in outputs:
-                xrslOutputs += '(%s "")' % (outputFile)
+        for outputFile in outputs:
+            xrslOutputs += '(%s "")' % (outputFile)

         xrsl = """
         &(executable="%(executable)s")
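
To illustrate the new behaviour, here is a self-contained sketch of the input-scanning logic above: inputs carrying the execute bit are also advertised through the XRSL "executables" attribute. It reproduces only the fragment-building part, not the full _writeXRSL:

    import os

    def build_xrsl_fragments(inputs, outputs, diracStamp):
        """Sketch of the fragment building done in _writeXRSL (illustrative only)."""
        xrslInputs = ""
        executables = []
        for inputFile in inputs:
            inputFileBaseName = os.path.basename(inputFile)
            if os.access(inputFile, os.X_OK):
                # Inputs with the execute bit set are flagged as executables
                executables.append(inputFileBaseName)
            xrslInputs += '(%s "%s")' % (inputFileBaseName, inputFile)

        xrslExecutables = ""
        if executables:
            xrslExecutables = "(executables=%s)" % " ".join(executables)

        # stdout/stderr are always retrieved, plus any requested outputs
        xrslOutputs = '("%s.out" "") ("%s.err" "")' % (diracStamp, diracStamp)
        for outputFile in outputs:
            xrslOutputs += '(%s "")' % outputFile

        return xrslInputs, xrslExecutables, xrslOutputs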
@@ -247,6 +237,13 @@ def _writeXRSL(self, executableFile, inputs=None, outputs=None, executables=None
     def _bundlePreamble(self, executableFile):
         """Bundle the preamble with the executable file"""
         wrapperContent = "%s\n./%s" % (self.preamble, executableFile)
+
+        # We need to make sure the executable file can be executed by the wrapper
+        # By adding the execution mode to the file, the file will be processed as an "executable" in the XRSL
+        # This is done in _writeXRSL()
+        if not os.access(executableFile, os.X_OK):
+            os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH)
+
         return writeScript(wrapperContent, os.getcwd())

     #############################################################################
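
A rough, standalone sketch of what _bundlePreamble now guarantees: the wrapper runs the preamble and then the original executable, and the executable is chmod-ed so that _writeXRSL later flags it as an executable input. The real code uses the DIRAC writeScript helper; a plain file write stands in for it here:

    import os
    import stat

    def bundle_preamble(preamble, executableFile, workdir):
        """Illustrative stand-in for _bundlePreamble (writeScript replaced by a plain write)."""
        wrapperContent = "%s\n./%s" % (preamble, executableFile)

        # Ensure the wrapped executable can be run by the wrapper; the execute bit
        # also makes _writeXRSL list it in the XRSL "executables" attribute.
        if not os.access(executableFile, os.X_OK):
            os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

        wrapperPath = os.path.join(workdir, "wrapper.sh")
        with open(wrapperPath, "w") as wrapper:
            wrapper.write(wrapperContent)
        os.chmod(wrapperPath, stat.S_IRWXU)
        return wrapperPath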
@@ -299,13 +296,14 @@ def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=
             return result
         self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

-        self.log.verbose("Executable file path: %s" % executableFile)
-        if not os.access(executableFile, 5):
-            os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH)
+        if not inputs:
+            inputs = []
+        if not outputs:
+            outputs = []

-        executables = None
+        self.log.verbose("Executable file path: %s" % executableFile)
         if self.preamble:
-            executables = [executableFile]
+            inputs.append(executableFile)
             executableFile = self._bundlePreamble(executableFile)

         batchIDList = []
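
With these changes, inputs and outputs are plain lists (defaulting to empty), and a configured preamble simply turns the original executable into one more input. A hypothetical call against an ARCComputingElement instance ce with a valid proxy could look like this; the file names are made up:

    # Hypothetical usage of the updated submitJob signature (file names are placeholders)
    result = ce.submitJob(
        "pilot_wrapper.sh",        # executable, wrapped by _bundlePreamble if a preamble is set
        proxy,
        numberOfJobs=1,
        inputs=["job.jdl", "dirac-install.py"],   # extra files shipped with the job
        outputs=["payload_results.tar.gz"],       # files retrieved after execution
    )
    if not result["OK"]:
        print("Submission failed:", result["Message"])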
@@ -325,7 +323,7 @@ def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=
             # The basic job description
             jobdescs = arc.JobDescriptionList()
             # Get the job into the ARC way
-            xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs, executables)
+            xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs)
             self.log.debug("XRSL string submitted : %s" % xrslString)
             self.log.debug("DIRAC stamp for job : %s" % diracStamp)
             # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast

src/DIRAC/Resources/Computing/AREXComputingElement.py

Lines changed: 91 additions & 56 deletions
@@ -304,13 +304,12 @@ def _getDelegationID(self, arcJobID):

     #############################################################################

-    def _getArcJobID(self, executableFile, inputs, outputs, executables, delegation):
+    def _getArcJobID(self, executableFile, inputs, outputs, delegation):
         """Get an ARC JobID endpoint to upload executables and inputs.

         :param str executableFile: executable to submit
         :param list inputs: list of input files
         :param list outputs: list of expected output files
-        :param list executables: list of secondary executables (will be uploaded with the executable mode)
         :param str delegation: delegation ID

         :return: tuple containing a job ID and a stamp
@@ -320,7 +319,7 @@ def _getArcJobID(self, executableFile, inputs, outputs, executables, delegation)
         query = self._urlJoin("jobs")

         # Get the job into the ARC way
-        xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs, executables)
+        xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs)
         xrslString += delegation
         self.log.debug("XRSL string submitted", "is %s" % xrslString)
         self.log.debug("DIRAC stamp for job", "is %s" % diracStamp)
@@ -344,21 +343,16 @@ def _getArcJobID(self, executableFile, inputs, outputs, executables, delegation)
         arcJobID = responseJob["id"]
         return S_OK((arcJobID, diracStamp))

-    def _uploadJobDependencies(self, arcJobID, executableFile, inputs, executables):
+    def _uploadJobDependencies(self, arcJobID, executableFile, inputs):
         """Upload job dependencies so that the job can start.
         This includes the executables and the inputs.

         :param str arcJobID: ARC job ID
         :param str executableFile: executable file
         :param list inputs: inputs required by the executable file
-        :param list executables: executables require by the executable file
         """
         filesToSubmit = [executableFile]
-        filesToSubmit += executables
-        if inputs:
-            if not isinstance(inputs, list):
-                inputs = [inputs]
-            filesToSubmit += inputs
+        filesToSubmit += inputs

         for fileToSubmit in filesToSubmit:
             queryExecutable = self._urlJoin(os.path.join("jobs", arcJobID, "session", os.path.basename(fileToSubmit)))
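
For orientation, a hedged sketch of the upload step that follows: each file in filesToSubmit is pushed to the job's session directory on the AREX endpoint. This uses requests directly with a placeholder base URL and token instead of the class's _request/session machinery:

    import os
    import requests

    def upload_job_dependencies(baseUrl, arcJobID, executableFile, inputs, token):
        """Sketch only: PUT every dependency into the ARC job session directory."""
        filesToSubmit = [executableFile] + inputs
        for fileToSubmit in filesToSubmit:
            url = baseUrl + "/" + os.path.join("jobs", arcJobID, "session", os.path.basename(fileToSubmit))
            with open(fileToSubmit, "rb") as payload:
                response = requests.put(url, data=payload, headers={"Authorization": "Bearer %s" % token})
            response.raise_for_status()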
@@ -376,32 +370,6 @@ def _uploadJobDependencies(self, arcJobID, executableFile, inputs, executables):
                 self.log.verbose("Input correctly uploaded", fileToSubmit)
         return S_OK()

-    def _killJob(self, arcJobList):
-        """Kill the specified jobs
-
-        :param list arcJobList: list of ARC Job IDs
-        """
-        result = self._checkSession()
-        if not result["OK"]:
-            self.log.error("Cannot kill jobs", result["Message"])
-            return result
-
-        # List of jobs in json format for the REST query
-        jobsJson = {"job": [{"id": job} for job in arcJobList]}
-
-        # Prepare the command
-        params = {"action": "kill"}
-        query = self._urlJoin("jobs")
-
-        # Killing jobs should be fast
-        result = self._request("post", query, params=params, data=json.dumps(jobsJson))
-        if not result["OK"]:
-            self.log.error("Failed to kill all these jobs.", result["Message"])
-            return S_ERROR("Failed to kill all these jobs")
-
-        self.log.debug("Successfully deleted jobs")
-        return S_OK()
-
     def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=None):
         """Method to submit job
         Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
@@ -423,10 +391,14 @@ def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=
         else:
             delegation = "\n(delegationid=%s)" % result["Value"]

+        if not inputs:
+            inputs = []
+        if not outputs:
+            outputs = []
+
         # If there is a preamble, then we bundle it in an executable file
-        executables = []
         if self.preamble:
-            executables = [executableFile]
+            inputs.append(executableFile)
             executableFile = self._bundlePreamble(executableFile)

         # Submit multiple jobs sequentially.
@@ -436,14 +408,14 @@ def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=
         batchIDList = []
         stampDict = {}
         for _ in range(numberOfJobs):
-            result = self._getArcJobID(executableFile, inputs, outputs, executables, delegation)
+            result = self._getArcJobID(executableFile, inputs, outputs, delegation)
             if not result["OK"]:
                 break
             arcJobID, diracStamp = result["Value"]

             # At this point, only the XRSL job has been submitted to AREX services
             # Here we also upload the executable, other executable files and inputs.
-            result = self._uploadJobDependencies(arcJobID, executableFile, inputs, executables)
+            result = self._uploadJobDependencies(arcJobID, executableFile, inputs)
             if not result["OK"]:
                 break

@@ -469,12 +441,83 @@ def killJob(self, jobIDList):

         :param list jobIDList: list of DIRAC Job IDs
         """
+        if not isinstance(jobIDList, list):
+            jobIDList = [jobIDList]
         self.log.debug("Killing jobs", ",".join(jobIDList))

-        # List of jobs in json format for the REST query
-        jList = [self._DiracToArcID(job) for job in jobIDList]
+        # Convert DIRAC jobs to ARC jobs
+        # DIRAC Jobs might be stored with a DIRAC stamp (":::XXXXX") that should be removed
+        jList = [self._DiracToArcID(job.split(":::")[0]) for job in jobIDList]
         return self._killJob(jList)

+
+    def _killJob(self, arcJobList):
+        """Kill the specified jobs
+
+        :param list arcJobList: list of ARC Job IDs
+        """
+        result = self._checkSession()
+        if not result["OK"]:
+            self.log.error("Cannot kill jobs", result["Message"])
+            return result
+
+        # List of jobs in json format for the REST query
+        jobsJson = {"job": [{"id": job} for job in arcJobList]}
+
+        # Prepare the command
+        params = {"action": "kill"}
+        query = self._urlJoin("jobs")
+
+        # Killing jobs should be fast
+        result = self._request("post", query, params=params, data=json.dumps(jobsJson))
+        if not result["OK"]:
+            self.log.error("Failed to kill all these jobs.", result["Message"])
+            return S_ERROR("Failed to kill all these jobs")
+
+        self.log.debug("Successfully deleted jobs")
+        return S_OK()
+
+    #############################################################################
+
+    def cleanJob(self, jobIDList):
+        """Clean files related to the specified jobs
+
+        :param list jobIDList: list of DIRAC Job IDs
+        """
+        if not isinstance(jobIDList, list):
+            jobIDList = [jobIDList]
+        self.log.debug("Cleaning jobs", ",".join(jobIDList))
+
+        # Convert DIRAC jobs to ARC jobs
+        # DIRAC Jobs might be stored with a DIRAC stamp (":::XXXXX") that should be removed
+        jList = [self._DiracToArcID(job.split(":::")[0]) for job in jobIDList]
+        return self._cleanJob(jList)
+
+    def _cleanJob(self, arcJobList):
+        """Clean files related to the specified jobs
+
+        :param list jobIDList: list of ARC Job IDs
+        """
+        result = self._checkSession()
+        if not result["OK"]:
+            self.log.error("Cannot clean jobs", result["Message"])
+            return result
+
+        # List of jobs in json format for the REST query
+        jobsJson = {"job": [{"id": job} for job in arcJobList]}
+
+        # Prepare the command
+        params = {"action": "clean"}
+        query = self._urlJoin("jobs")
+
+        # Cleaning jobs
+        result = self._request("post", query, params=params, data=json.dumps(jobsJson))
+        if not result["OK"]:
+            self.log.error("Failed to clean all these jobs.", result["Message"])
+            return S_ERROR("Failed to clean all these jobs")
+
+        self.log.debug("Successfully cleaned jobs")
+        return S_OK()
     #############################################################################

     def getCEStatus(self):
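
Both killJob and the new cleanJob share the same pattern: strip the optional DIRAC stamp, convert the IDs, and POST a JSON job list to the jobs endpoint with the corresponding action. A condensed sketch of that shared pattern, again with requests, a placeholder URL/token, and a caller-supplied ID converter standing in for self._DiracToArcID:

    import json
    import requests

    def post_job_action(baseUrl, diracJobIDs, action, dirac_to_arc, token):
        """Sketch only: send a "kill" or "clean" action for a list of DIRAC job IDs."""
        # DIRAC job IDs may carry a ":::<stamp>" suffix that must be dropped first
        arcJobs = [dirac_to_arc(job.split(":::")[0]) for job in diracJobIDs]
        jobsJson = {"job": [{"id": job} for job in arcJobs]}

        response = requests.post(
            baseUrl + "/jobs",
            params={"action": action},
            data=json.dumps(jobsJson),
            headers={"Authorization": "Bearer %s" % token},
        )
        response.raise_for_status()
        return response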
@@ -613,14 +656,10 @@ def getJobStatus(self, jobIDList):
         if not isinstance(jobIDList, list):
             jobIDList = [jobIDList]

-        # Jobs are stored with a DIRAC stamp (":::XXXXX") appended
-        jobList = []
-        for j in jobIDList:
-            job = j.split(":::")[0]
-            jobList.append(job)
-
-        self.log.debug("Getting status of jobs : %s" % jobList)
-        arcJobsJson = {"job": [{"id": self._DiracToArcID(job)} for job in jobList]}
+        self.log.debug("Getting status of jobs:", jobIDList)
+        # Convert DIRAC jobs to ARC jobs and encapsulate them in a dictionary for the REST query
+        # DIRAC Jobs might be stored with a DIRAC stamp (":::XXXXX") that should be removed
+        arcJobsJson = {"job": [{"id": self._DiracToArcID(job.split(":::")[0])} for job in jobIDList]}

         # Prepare the command
         params = {"action": "status"}
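
The status query applies the same stamp handling inline. A tiny illustration of the payload it builds, with made-up job IDs (in the real code each ID also goes through self._DiracToArcID):

    # Made-up DIRAC job IDs, one with a ":::<stamp>" suffix and one without
    jobIDList = ["0123456789abcdef:::a1b2c3d4", "fedcba9876543210"]

    arcJobsJson = {"job": [{"id": job.split(":::")[0]} for job in jobIDList]}
    # -> {'job': [{'id': '0123456789abcdef'}, {'id': 'fedcba9876543210'}]}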
@@ -688,12 +727,8 @@ def getJobLog(self, jobID):
             self.log.error("Cannot get job logging info", result["Message"])
             return result

-        # Extract stamp from the Job ID
-        if ":::" in jobID:
-            jobID = jobID.split(":::")[0]
-
         # Prepare the command: Get output files
-        arcJob = self._DiracToArcID(jobID)
+        arcJob = self._DiracToArcID(jobID.split(":::")[0])
         query = self._urlJoin(os.path.join("jobs", arcJob, "diagnose", "errors"))

         # Submit the GET request to retrieve outputs
@@ -759,9 +794,9 @@ def getJobOutput(self, jobID, workingDirectory=None):
         remoteOutputs = result["Value"]
         self.log.debug("Outputs to get are", remoteOutputs)

-        # We assume that workingDirectory exists
         if not workingDirectory:
             if "WorkingDirectory" in self.ceParameters:
+                # We assume that workingDirectory exists
                 workingDirectory = os.path.join(self.ceParameters["WorkingDirectory"], job)
             else:
                 workingDirectory = job

0 commit comments
