Skip to content

Commit d07be70

Browse files
committed
adding test for slurm rescheduling
1 parent 0b9734c commit d07be70

File tree

2 files changed

+62
-4
lines changed

2 files changed

+62
-4
lines changed

pydra/engine/tests/test_submitter.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from .utils import gen_basic_wf
1010
from ..core import Workflow
11+
from ..task import ShellCommandTask
1112
from ..submitter import Submitter
1213
from ... import mark
1314

@@ -261,3 +262,58 @@ def test_slurm_args_2(tmpdir):
261262
with pytest.raises(RuntimeError, match="Error returned from sbatch:"):
262263
with Submitter("slurm", sbatch_args="-N1 --invalid") as sub:
263264
sub(task)
265+
266+
267+
@pytest.mark.skipif(not slurm_available, reason="slurm not installed")
def test_slurm_cancel_rerun(tmpdir):
    """Test that a task cancelled while running under slurm is re-queued.

    Runs a workflow with two tasks: one sleeps, while the other looks up the
    slurm job id of the sleeping task and cancels it.
    The first job should be re-queued and finish without problems.
    (Possibly has to be improved: in theory the sleeping job might finish
    before the cancel task manages to cancel it.)
    """

    @mark.task
    def sleep(x):
        # imported locally so the task body does not depend on module globals
        import time

        time.sleep(x)
        return x

    @mark.task
    def cancel(job_name_part):
        import subprocess as sp
        import time

        # poll squeue until a job whose line contains job_name_part shows up
        job_id = ""
        while job_id == "":
            time.sleep(1)
            queue = sp.run(["squeue"], stdout=sp.PIPE).stdout.decode("utf-8")
            job_ids = [
                line.split()[0]
                for line in queue.splitlines()
                if job_name_part in line
            ]
            if job_ids:
                # use a single id even if several queued jobs match the name,
                # so that scancel/sacct always receive one valid job id
                job_id = job_ids[0]

        # cancelling the job
        sp.run(["scancel", job_id])
        # checking the status of the job; returning the full sacct output
        status = sp.run(["sacct", "-j", job_id], stdout=sp.PIPE, stderr=sp.PIPE)
        return status.stdout.decode("utf-8").strip()

    wf = Workflow(name="wf", input_spec=["x", "job_name"], cache_dir=tmpdir)
    wf.add(sleep(name="sleep", x=wf.lzin.x))
    wf.add(cancel(name="cancel", job_name_part=wf.lzin.job_name))
    # x is the sleep time in seconds; job_name is the part of the slurm job
    # name used by the cancel task to find the sleeping job in squeue
    wf.inputs.x = 20
    wf.inputs.job_name = "sleep"

    wf.set_output([("out", wf.sleep.lzout.out), ("canc_out", wf.cancel.lzout.out)])
    with Submitter("slurm") as sub:
        sub(wf)

    res = wf.result()
    assert res.output.out == 20
    # checking if indeed the sleep-task job was cancelled by cancel-task
    assert "CANCELLED" in res.output.canc_out
    script_dir = tmpdir / "SlurmWorker_scripts"
    assert script_dir.exists()

pydra/engine/workers.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,10 @@ async def _submit_job(self, batchscript, name, checksum, cache_dir):
300300
# Exception: Polling / job failure
301301
done = await self._poll_job(jobid)
302302
if done:
303-
if done == "CANCELLED":
304-
(cache_dir / f"{checksum}.lock").unlink(missing_ok=True)
303+
if done in ["CANCELLED", "TIMEOUT", "PREEMPTED"]:
304+
if (cache_dir / f"{checksum}.lock").exists():
305+
# for py3.8 we could use missing_ok=True
306+
(cache_dir / f"{checksum}.lock").unlink()
305307
cmd_re = ("scontrol", "requeue", jobid)
306308
await read_and_display_async(*cmd_re, hide_display=True)
307309
else:
@@ -326,8 +328,8 @@ async def _verify_exit_code(self, jobid):
326328
m = self._sacct_re.search(stdout)
327329
error_file = self.error[jobid]
328330
if int(m.group("exit_code")) != 0 or m.group("status") != "COMPLETED":
329-
if m.group("status") == "CANCELLED":
330-
return "CANCELLED"
331+
if m.group("status") in ["CANCELLED", "TIMEOUT", "PREEMPTED"]:
332+
return m.group("status")
331333
elif m.group("status") in ["RUNNING", "PENDING"]:
332334
return False
333335
# TODO: potential for requeuing

0 commit comments

Comments
 (0)