Skip to content

Commit 4d2d970

Browse files
committed
fix: change periodic_remove condition and fix parseCondorStatus
1 parent 142b82f commit 4d2d970

File tree

3 files changed

+33
-24
lines changed

3 files changed

+33
-24
lines changed

src/DIRAC/Resources/Computing/BatchSystems/Condor.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
5: "Failed",
2424
}
2525

26-
HOLD_REASON_SUBCODE = 55
26+
HOLD_REASON_SUBCODE = "55"
2727

2828
subTemplate = """
2929
# Environment
@@ -71,7 +71,7 @@
7171
# A random subcode to identify who put the job on hold
7272
on_exit_hold_subcode = %(holdReasonSubcode)s
7373
# Jobs are then deleted from the system after N days if they are not running
74-
period_remove = (JobStatus != 2) && (time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)
74+
period_remove = (JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
7575
7676
# Specific options
7777
# ----------------
@@ -94,13 +94,17 @@ def parseCondorStatus(lines, jobID):
9494
:returns: Status as known by DIRAC, and a reason if the job is being held
9595
"""
9696
jobID = str(jobID)
97+
98+
holdReason = ""
9799
for line in lines:
98100
l = line.strip().split()
101+
102+
# Make sure the status is present and is an integer
99103
try:
100104
status = int(l[1])
101105
except (ValueError, IndexError):
102106
continue
103-
holdReason = ""
107+
104108
if l[0] == jobID:
105109
# A job can be held for many different reasons; we need to investigate further using the holdReasonCode & holdReasonSubCode
106110
# Details in:
@@ -110,22 +114,22 @@ def parseCondorStatus(lines, jobID):
110114
# By default, a held (5) job is defined as Aborted, but there might be some exceptions
111115
status = 3
112116
try:
113-
holdReasonCode = int(l[2])
114-
holdReasonSubcode = int(l[3])
115-
holdReason = l[4:]
116-
except (ValueError, IndexError):
117+
holdReasonCode = l[2]
118+
holdReasonSubcode = l[3]
119+
holdReason = " ".join(l[4:])
120+
except IndexError:
117121
# This should not happen in theory
118122
# Just set the status to unknown in such a case
119123
status = -1
120-
holdReasonCode = -1
121-
holdReasonSubcode = -1
124+
holdReasonCode = "undefined"
125+
holdReasonSubcode = "undefined"
122126

123127
# If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
124128
# And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
125-
if holdReasonCode == 3 and holdReasonSubcode == HOLD_REASON_SUBCODE:
129+
if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
126130
status = 5
127131
# If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
128-
elif holdReasonCode == 16:
132+
elif holdReasonCode == "16":
129133
status = 1
130134

131135
return (STATES_MAP.get(status, "Unknown"), holdReason)

src/DIRAC/Resources/Computing/HTCondorCEComputingElement.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def _executeCondorCommand(self, cmd, keepTokenFile=False):
272272
self.tokenFile = None
273273
# We have got a non-zero status code
274274
errorString = stderr if stderr else stdout
275-
return S_ERROR(f"Command", f"{cmd} failed with: {status} - {errorString.strip()}")
275+
return S_ERROR(f"Command {cmd} failed with: {status} - {errorString.strip()}")
276276

277277
# Remove token file if we do not want to keep it
278278
self.tokenFile = self.tokenFile if keepTokenFile else None
@@ -413,7 +413,7 @@ def getJobStatus(self, jobIDList):
413413

414414
qList = []
415415
for _condorIDs in breakListIntoChunks(condorIDs.values(), 100):
416-
# This will return a list of 1245.75 3
416+
# This will return a list of 1245.75 3 undefined undefined undefined
417417
cmd = ["condor_q"]
418418
cmd.extend(self.remoteScheddOptions.strip().split(" "))
419419
cmd.extend(_condorIDs)

src/DIRAC/Resources/Computing/test/Test_HTCondorCEComputingElement.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def test_parseCondorStatus():
6161
"foo": "Unknown",
6262
"104096.1": "Aborted",
6363
"104096.2": "Aborted",
64-
"104096.3": "Unknown",
64+
"104096.3": "Aborted",
6565
"104096.4": "Unknown",
6666
}
6767

@@ -73,7 +73,7 @@ def test_parseCondorStatus():
7373
def test_getJobStatus(mocker):
7474
"""Test HTCondorCE getJobStatus"""
7575
mocker.patch(
76-
MODNAME + ".HTCondorCEComputingElement._executeCondorCommand",
76+
MODNAME + ".executeGridCommand",
7777
side_effect=[
7878
S_OK((0, "\n".join(STATUS_LINES), "")),
7979
S_OK((0, "\n".join(HISTORY_LINES), "")),
@@ -82,8 +82,12 @@ def test_getJobStatus(mocker):
8282
],
8383
)
8484
mocker.patch(MODNAME + ".HTCondorCEComputingElement._HTCondorCEComputingElement__cleanup")
85+
mocker.patch(MODNAME + ".HTCondorCEComputingElement._prepareProxy", return_value=S_OK())
8586

8687
htce = HTCE.HTCondorCEComputingElement(12345)
88+
# Need to initialize proxy because it is required by executeCondorCommand()
89+
htce.proxy = "dumb_proxy"
90+
8791
ret = htce.getJobStatus(
8892
[
8993
"htcondorce://condorce.foo.arg/123.0:::abc321",
@@ -99,6 +103,7 @@ def test_getJobStatus(mocker):
99103
"htcondorce://condorce.foo.arg/123.2": "Aborted",
100104
"htcondorce://condorce.foo.arg/333.3": "Unknown",
101105
}
106+
print(ret)
102107
assert ret["OK"] is True
103108
assert expectedResults == ret["Value"]
104109

@@ -153,18 +158,18 @@ def test__writeSub(mocker, localSchedd, optionsNotExpected, optionsExpected):
153158

154159

155160
@pytest.mark.parametrize(
156-
"localSchedd, expected", [(False, "-pool condorce.cern.ch:9619 -name condorce.cern.ch"), (True, "")]
161+
"localSchedd, expected", [(False, "-pool condorce.cern.ch:9619 -name condorce.cern.ch "), (True, "")]
157162
)
158163
def test_reset(setUp, localSchedd, expected):
159164
ceParameters = setUp
160165

161166
htce = HTCE.HTCondorCEComputingElement(12345)
162167
htce.ceParameters = ceParameters
163-
htce.useLocalSchedd = True
168+
htce.useLocalSchedd = localSchedd
164169
ceName = "condorce.cern.ch"
165170
htce.ceName = ceName
166171
htce._reset()
167-
assert htce.remoteScheddOptions == ""
172+
assert htce.remoteScheddOptions == expected
168173

169174

170175
@pytest.mark.parametrize(
@@ -179,12 +184,12 @@ def test_submitJob(setUp, mocker, localSchedd, expected):
179184
htce = HTCE.HTCondorCEComputingElement(12345)
180185
htce.ceParameters = ceParameters
181186
htce.useLocalSchedd = localSchedd
187+
htce.proxy = "dumb_proxy"
182188
ceName = "condorce.cern.ch"
183189
htce.ceName = ceName
184190

185-
execMock = mocker.patch(
186-
MODNAME + ".HTCondorCEComputingElement._executeCondorCommand", return_value=S_OK((0, "123.0 - 123.0", ""))
187-
)
191+
execMock = mocker.patch(MODNAME + ".executeGridCommand", return_value=S_OK((0, "123.0 - 123.0", "")))
192+
mocker.patch(MODNAME + ".HTCondorCEComputingElement._prepareProxy", return_value=S_OK())
188193
mocker.patch(
189194
MODNAME + ".HTCondorCEComputingElement._HTCondorCEComputingElement__writeSub", return_value="dirac_pilot"
190195
)
@@ -212,13 +217,13 @@ def test_killJob(setUp, mocker, jobIDList, jobID, ret, success, local):
212217
ceParameters = setUp
213218
htce = HTCE.HTCondorCEComputingElement(12345)
214219
htce.ceName = "condorce.foo.arg"
220+
htce.proxy = "dumb_proxy"
215221
htce.useLocalSchedd = local
216222
htce.ceParameters = ceParameters
217223
htce._reset()
218224

219-
execMock = mocker.patch(
220-
MODNAME + ".HTCondorCEComputingElement._executeCondorCommand", return_value=S_OK((ret, "", ""))
221-
)
225+
execMock = mocker.patch(MODNAME + ".executeGridCommand", return_value=S_OK((ret, "", "")))
226+
mocker.patch(MODNAME + ".HTCondorCEComputingElement._prepareProxy", return_value=S_OK())
222227

223228
ret = htce.killJob(jobIDList=jobIDList)
224229
assert ret["OK"] == success

0 commit comments

Comments
 (0)