Skip to content

Commit 557eb7b

Browse files
committed
fix: minor changes in HTCondor
1 parent 4d2d970 commit 557eb7b

File tree

3 files changed

+59
-60
lines changed

3 files changed

+59
-60
lines changed

src/DIRAC/Resources/Computing/BatchSystems/Condor.py

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,10 @@
4545
error = $(Cluster).$(Process).err
4646
log = $(Cluster).$(Process).log
4747
48-
# Transfer all output files, even if the job is failed
48+
# No other files are to be transferred
4949
transfer_output_files = ""
50+
51+
# Transfer outputs, even if the job is failed
5052
should_transfer_files = YES
5153
when_to_transfer_output = ON_EXIT_OR_EVICT
5254
@@ -68,10 +70,10 @@
6870
# By default, HTCondor marked jobs as completed regardless of its status
6971
# This option allows to mark jobs as Held if they don't finish successfully
7072
on_exit_hold = ExitCode != 0
71-
# A random subcode to identify who put the job on hold
73+
# A subcode of our choice to identify who put the job on hold
7274
on_exit_hold_subcode = %(holdReasonSubcode)s
73-
# Jobs are then deleted from the system after N days if they are not running
74-
period_remove = (JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
75+
# Jobs are then deleted from the system after N days if they are not idle or running
76+
periodic_remove = (JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
7577
7678
# Specific options
7779
# ----------------
@@ -88,52 +90,60 @@
8890
def parseCondorStatus(lines, jobID):
8991
"""parse the condor_q or condor_history output for the job status
9092
91-
:param lines: list of lines from the output of the condor commands, each line is a pair of jobID, statusID, and holdReasonCode
93+
:param lines: list of lines from the output of the condor commands, each line holds jobID, statusID and, for held jobs, holdReasonCode, holdReasonSubcode and holdReason
9294
:type lines: python:list
9395
:param str jobID: jobID of condor job, e.g.: 123.53
9496
:returns: Status as known by DIRAC, and a reason if the job is being held
9597
"""
9698
jobID = str(jobID)
9799

98100
holdReason = ""
101+
status = None
99102
for line in lines:
100103
l = line.strip().split()
101104

105+
# Make sure the job ID exists
106+
if len(l) < 1 or l[0] != jobID:
107+
continue
108+
102109
# Make sure the status is present and is an integer
103110
try:
104111
status = int(l[1])
105112
except (ValueError, IndexError):
106-
continue
113+
break
107114

108-
if l[0] == jobID:
109-
# A job can be held for many various reasons, we need to further investigate with the holdReasonCode & holdReasonSubCode
110-
# Details in:
111-
# https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
112-
if status == 5:
113-
114-
# By default, a held (5) job is defined as Aborted, but there might be some exceptions
115-
status = 3
116-
try:
117-
holdReasonCode = l[2]
118-
holdReasonSubcode = l[3]
119-
holdReason = " ".join(l[4:])
120-
except IndexError:
121-
# This should not happen in theory
122-
# Just set the status to unknown such as
123-
status = -1
124-
holdReasonCode = "undefined"
125-
holdReasonSubcode = "undefined"
126-
127-
# If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
128-
# And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
129-
if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
130-
status = 5
131-
# If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
132-
elif holdReasonCode == "16":
133-
status = 1
134-
135-
return (STATES_MAP.get(status, "Unknown"), holdReason)
136-
return ("Unknown", holdReason)
115+
# Stop here if the status is not held (5): result should be found in STATES_MAP
116+
if status != 5:
117+
break
118+
119+
# A job can be held for various reasons,
120+
# we need to further investigate with the holdReasonCode & holdReasonSubCode
121+
# Details in:
122+
# https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
123+
124+
# By default, a held (5) job is defined as Aborted in STATES_MAP, but there might be some exceptions
125+
status = 3
126+
try:
127+
holdReasonCode = l[2]
128+
holdReasonSubcode = l[3]
129+
holdReason = " ".join(l[4:])
130+
except IndexError:
131+
# This should not happen in theory
132+
# Leave the status undefined so that the job is reported as "Unknown"
133+
status = None
134+
holdReasonCode = "undefined"
135+
holdReasonSubcode = "undefined"
136+
break
137+
138+
# If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
139+
# And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
140+
if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
141+
status = 5
142+
# If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
143+
elif holdReasonCode == "16":
144+
status = 1
145+
146+
return (STATES_MAP.get(status, "Unknown"), holdReason)
137147

138148

139149
class Condor(object):

src/DIRAC/Resources/Computing/HTCondorCEComputingElement.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ def __init__(self, ceUniqueID):
110110
#############################################################################
111111

112112
def _DiracToCondorID(self, diracJobID):
113-
"""Convert a DIRAC jobID into an Condor jobID.
114-
Example: https://<ce>/1234/0 becomes 1234.0
113+
"""Convert a DIRAC jobID into a Condor jobID.
114+
Example: htcondorce://<ce>/1234.0 becomes 1234.0
115115
116116
:param str diracJobID: DIRAC jobID
117117
:return: Condor jobID
@@ -128,7 +128,7 @@ def _condorToDiracID(self, condorJobIDs):
128128
129129
:param str condorJobIDs: the output of condor_submit
130130
131-
:return: job references such as htcondorce://<CE name>/<clusterID>.<i>
131+
:return: job references such as htcondorce://<ce>/<clusterID>.<i>
132132
"""
133133
clusterIDs = condorJobIDs.split("-")
134134
if len(clusterIDs) != 2:
@@ -179,7 +179,6 @@ def __writeSub(self, executable, location, processors, pilotStamps, tokenFile=No
179179

180180
# Remote schedd options by default
181181
targetUniverse = "vanilla"
182-
# This is used to remove outputs from the remote schedd
183182
scheddOptions = ""
184183
if self.useLocalSchedd:
185184
targetUniverse = "grid"
@@ -227,7 +226,7 @@ def _executeCondorCommand(self, cmd, keepTokenFile=False):
227226
228227
:param list cmd: list of the condor command elements
229228
:param bool keepTokenFile: flag to reuse or not the previously created token file
230-
:return: S_OK/S_ERROR - the result of the executeGridCommand() call
229+
:return: S_OK/S_ERROR - the stdout parameter of the executeGridCommand() call
231230
"""
232231
if not self.token and not self.proxy:
233232
return S_ERROR(f"Cannot execute the command, token and proxy not found: {cmd}")
@@ -406,8 +405,7 @@ def getJobStatus(self, jobIDList):
406405
# Get all condorIDs so we can just call condor_q and condor_history once
407406
for diracJobID in jobIDList:
408407
diracJobID = diracJobID.split(":::")[0]
409-
condorJobID = self._DiracToCondorID(diracJobID)
410-
condorIDs[diracJobID] = condorJobID
408+
condorIDs[diracJobID] = self._DiracToCondorID(diracJobID)
411409

412410
self.tokenFile = None
413411

@@ -422,8 +420,7 @@ def getJobStatus(self, jobIDList):
422420
if not result["OK"]:
423421
return result
424422

425-
_qList = result["Value"].split("\n")
426-
qList.extend(_qList)
423+
qList.extend(result["Value"].split("\n"))
427424

428425
condorHistCall = ["condor_history"]
429426
condorHistCall.extend(self.remoteScheddOptions.strip().split(" "))
@@ -433,21 +430,15 @@ def getJobStatus(self, jobIDList):
433430
if not result["OK"]:
434431
return result
435432

436-
_qList = result["Value"].split("\n")
437-
qList.extend(_qList)
433+
qList.extend(result["Value"].split("\n"))
438434

439-
jobsToCancel = []
440435
for job, jobID in condorIDs.items():
441-
pilotStatus, reason = parseCondorStatus(qList, jobID)
436+
jobStatus, reason = parseCondorStatus(qList, jobID)
442437

443-
if pilotStatus == PilotStatus.ABORTED:
444-
self.log.verbose("Held job", f"{jobID} because: {reason}")
445-
jobsToCancel.append(jobID)
438+
if jobStatus == PilotStatus.ABORTED:
439+
self.log.verbose("Job", f"{jobID} held: {reason}")
446440

447-
resultDict[job] = pilotStatus
448-
449-
# Make sure the pilot stays dead and gets taken out of the condor_q
450-
self.killJob(jobsToCancel)
441+
resultDict[job] = jobStatus
451442

452443
self.tokenFile = None
453444

@@ -480,7 +471,7 @@ def getJobOutput(self, jobID):
480471

481472
return S_OK((result["Value"]["output"], result["Value"]["error"]))
482473

483-
def _findFile(self, workingDir, fileName, pathToResult):
474+
def _findFile(self, fileName, pathToResult):
484475
"""Find a file in a file system.
485476
486477
The file is searched for under ``self.workingDirectory``, the directory expected to contain it.
@@ -489,7 +480,7 @@ def _findFile(self, workingDir, fileName, pathToResult):
489480
490481
:return: path leading to the file
491482
"""
492-
path = os.path.join(workingDir, pathToResult, fileName)
483+
path = os.path.join(self.workingDirectory, pathToResult, fileName)
493484
if os.path.exists(path):
494485
return S_OK(path)
495486
return S_ERROR(errno.ENOENT, f"Could not find {path}")
@@ -535,7 +526,7 @@ def __getJobOutput(self, jobID, outTypes):
535526
outputsSuffix = {"output": "out", "error": "err", "logging": "log"}
536527
outputs = {}
537528
for output, suffix in outputsSuffix.items():
538-
resOut = self._findFile(self.workingDirectory, f"{condorJobID}.{suffix}", pathToResult)
529+
resOut = self._findFile(f"{condorJobID}.{suffix}", pathToResult)
539530
if not resOut["OK"]:
540531
# Return an error if the output type was targeted, else we continue
541532
if output in outTypes:

src/DIRAC/Resources/Computing/test/Test_HTCondorCEComputingElement.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ def test_parseCondorStatus():
6666
}
6767

6868
for jobID, expected in expectedResults.items():
69-
print(jobID, expected)
7069
assert HTCE.parseCondorStatus(statusLines, jobID)[0] == expected
7170

7271

@@ -103,7 +102,6 @@ def test_getJobStatus(mocker):
103102
"htcondorce://condorce.foo.arg/123.2": "Aborted",
104103
"htcondorce://condorce.foo.arg/333.3": "Unknown",
105104
}
106-
print(ret)
107105
assert ret["OK"] is True
108106
assert expectedResults == ret["Value"]
109107

0 commit comments

Comments
 (0)