Skip to content

Commit 2ce9b02

Browse files
committed
Try to run slurm tests
- ensure n is serialized in batch launchers
- define signal command for batch launchers
1 parent 1568745 commit 2ce9b02

File tree

6 files changed

+161
-36
lines changed

6 files changed

+161
-36
lines changed

.github/workflows/test.yml

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,17 @@ jobs:
1919
matrix:
2020
include:
2121
- python: "3.9"
22-
ssh: ssh
22+
cluster_type: ssh
23+
- python: "3.8"
24+
cluster_type: mpi
25+
- python: "3.7"
26+
cluster_type: slurm
2327
- python: "3.6"
2428
tornado: "5.1.1"
2529
- python: "3.7"
2630
controller_ip: "*"
2731
- python: "3.8"
2832
runs_on: windows-2019
29-
- python: "3.8"
30-
mpi: mpi
3133
- python: "3.9"
3234
runs_on: macos-10.15
3335

@@ -47,15 +49,27 @@ jobs:
4749
echo "IPP_CONTROLLER_IP=${{ matrix.controller_ip }}" >> $GITHUB_ENV
4850
4951
- name: Set up docker-compose for ssh launcher
50-
if: ${{ matrix.ssh }}
52+
if: ${{ matrix.cluster_type == 'ssh' }}
5153
run: |
5254
export DOCKER_BUILDKIT=1
5355
export COMPOSE_DOCKER_CLI_BUILD=1
5456
cd ci/ssh
5557
docker-compose up -d --build
5658
59+
- name: Set up slurm
60+
if: ${{ matrix.cluster_type == 'slurm' }}
61+
run: |
62+
sudo rm -rf /var/lib/apt/lists
63+
sudo apt-get update && sudo apt-get -f -y install && sudo apt-get -y install slurm-wlm
64+
sudo cp ci/slurm/slurm.conf /etc/slurm-llnl/
65+
sudo mkdir /var/spool/slurmctl
66+
sudo mkdir /var/spool/slurmd
67+
sudo service munge start
68+
sudo service slurmd start
69+
sudo service slurmctld start
70+
5771
- name: Install Python (conda) ${{ matrix.python }}
58-
if: matrix.mpi
72+
if: ${{ matrix.cluster_type == 'mpi' }}
5973
run: |
6074
export MAMBA_ROOT_PREFIX=$HOME/conda
6175
test -d $MAMBA_ROOT_PREFIX || mkdir $MAMBA_ROOT_PREFIX
@@ -66,7 +80,7 @@ jobs:
6680
echo "PATH=$MAMBA_ROOT_PREFIX/bin:$PATH" >> $GITHUB_ENV
6781
6882
- name: Install Python ${{ matrix.python }}
69-
if: ${{ ! matrix.mpi }}
83+
if: ${{ matrix.cluster_type != 'mpi' }}
7084
uses: actions/setup-python@v2
7185
with:
7286
python-version: ${{ matrix.python }}
@@ -85,23 +99,29 @@ jobs:
8599
- name: Show environment
86100
run: pip freeze
87101

88-
- name: Run mpi tests
89-
if: ${{ matrix.mpi }}
102+
- name: Run ${{ matrix.cluster_type }} tests
103+
if: ${{ matrix.cluster_type }}
90104
run: |
91-
pytest -vx --color=yes --cov=ipyparallel ipyparallel/tests/test_cluster.py
92-
93-
- name: Run ssh tests
94-
if: ${{ matrix.ssh }}
95-
run: |
96-
pytest -ra -v --maxfail=2 --color=yes --cov=ipyparallel ipyparallel/tests/test_ssh.py
105+
pytest -ra -v --maxfail=2 --color=yes --cov=ipyparallel ipyparallel/tests/test_${{ matrix.cluster_type }}.py
97106
98107
- name: Run tests
99-
if: ${{ ! matrix.mpi && ! matrix.ssh }}
108+
if: ${{ ! matrix.cluster_type }}
100109
# FIXME: --color=yes explicitly set because:
101110
# https://github.com/actions/runner/issues/241
102111
run: |
103-
pytest -v --maxfail=3 --color=yes --cov=ipyparallel ipyparallel/tests
112+
pytest -ra -v --maxfail=3 --color=yes --cov=ipyparallel ipyparallel/tests
104113
105114
- name: Submit codecov report
106115
run: |
107116
codecov
117+
118+
- name: Report on slurm
119+
if: ${{ matrix.cluster_type == 'slurm' && failure() }}
120+
run: |
121+
set -x
122+
slurmd -C
123+
ls -l
124+
squeue
125+
sinfo
126+
scontrol show node=localhost
127+
sudo cat /var/log/slurm-llnl/*

ci/slurm/slurm.conf

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# slurm.conf file generated by configurator easy.html.
2+
# Put this file on all nodes of your cluster.
3+
# See the slurm.conf man page for more information.
4+
#
5+
SlurmctldHost=localhost
6+
#
7+
#MailProg=/bin/mail
8+
MpiDefault=none
9+
#MpiParams=ports=#-#
10+
ProctrackType=proctrack/linuxproc
11+
ReturnToService=1
12+
SlurmctldPidFile=/var/run/slurmctld.pid
13+
#SlurmctldPort=6817
14+
SlurmdPidFile=/var/run/slurmd.pid
15+
#SlurmdPort=6818
16+
SlurmdSpoolDir=/var/spool/slurmd
17+
SlurmUser=root
18+
#SlurmdUser=root
19+
StateSaveLocation=/var/spool/slurmctl
20+
SwitchType=switch/none
21+
#TaskPlugin=task/affinity
22+
#CoreSpecPlugin=core_spec/none
23+
24+
#
25+
# TIMERS
26+
#KillWait=30
27+
#MinJobAge=300
28+
#SlurmctldTimeout=120
29+
#SlurmdTimeout=300
30+
#
31+
#
32+
# SCHEDULING
33+
SchedulerType=sched/backfill
34+
SelectType=select/cons_tres
35+
SelectTypeParameters=CR_Core
36+
#
37+
#
38+
# LOGGING AND ACCOUNTING
39+
AccountingStorageType=accounting_storage/none
40+
ClusterName=cluster
41+
#JobAcctGatherFrequency=30
42+
JobAcctGatherType=jobacct_gather/none
43+
SlurmctldDebug=debug5
44+
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
45+
SlurmdDebug=debug5
46+
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
47+
#
48+
#
49+
# COMPUTE NODES
50+
NodeName=localhost NodeHostName=localhost NodeAddr=127.0.0.1 CPUs=3 State=UNKNOWN
51+
PartitionName=part Nodes=localhost Default=YES MaxTime=INFINITE State=UP OverSubscribe=YES

ipyparallel/cluster/launcher.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,12 @@ def _cluster_id_default(self):
15861586
help="The name of the command line program used to delete jobs.",
15871587
)
15881588

1589+
signal_command = List(
1590+
[''],
1591+
config=True,
1592+
help="The name of the command line program used to send signals to jobs.",
1593+
)
1594+
15891595
job_id = Unicode().tag(to_dict=True)
15901596

15911597
job_id_regexp = CRegExp(
@@ -1616,7 +1622,7 @@ def _cluster_id_default(self):
16161622
def _queue_changed(self, change):
16171623
self._update_context(change)
16181624

1619-
n = Integer(1)
1625+
n = Integer(1).tag(to_dict=True)
16201626

16211627
@observe('n')
16221628
def _n_changed(self, change):
@@ -1643,7 +1649,7 @@ def _n_changed(self, change):
16431649
This lets you parameterize additional options,
16441650
such as wall_time with a custom template.
16451651
""",
1646-
)
1652+
).tag(to_dict=True)
16471653

16481654
@default("context")
16491655
def _context_default(self):
@@ -1748,26 +1754,33 @@ def start(self, n=1):
17481754

17491755
def stop(self):
17501756
try:
1751-
p = Popen(
1757+
output = check_output(
17521758
self.delete_command + [self.job_id],
1753-
env=os.environ,
1754-
stdout=PIPE,
1755-
stderr=PIPE,
1756-
)
1757-
out, err = p.communicate()
1758-
output = out + err
1759-
except:
1759+
stdin=None,
1760+
).decode(DEFAULT_ENCODING, 'replace')
1761+
except Exception:
17601762
self.log.exception(
17611763
"Problem stopping cluster with command: %s"
17621764
% (self.delete_command + [self.job_id])
17631765
)
17641766
output = ""
1765-
output = output.decode(DEFAULT_ENCODING, 'replace')
1767+
17661768
self.notify_stop(
17671769
dict(job_id=self.job_id, output=output)
17681770
) # Pass the output of the kill cmd
17691771
return output
17701772

1773+
def signal(self, sig):
1774+
cmd = self.signal_command + [str(sig), self.job_id]
1775+
try:
1776+
output = check_output(
1777+
cmd,
1778+
stdin=None,
1779+
).decode(DEFAULT_ENCODING, 'replace')
1780+
except Exception:
1781+
self.log.exception(f"Problem sending signal with: {shlex_join(cmd)}")
1782+
output = ""
1783+
17711784

17721785
class BatchControllerLauncher(BatchSystemLauncher, ControllerLauncher):
17731786
@default("program")
@@ -1813,6 +1826,9 @@ class PBSLauncher(BatchSystemLauncher):
18131826

18141827
submit_command = List(['qsub'], config=True, help="The PBS submit command ['qsub']")
18151828
delete_command = List(['qdel'], config=True, help="The PBS delete command ['qdel']")
1829+
signal_command = List(
1830+
['qsig', '-s'], config=True, help="The PBS signal command ['qsig', '-s']"
1831+
)
18161832
job_id_regexp = CRegExp(
18171833
r'\d+',
18181834
config=True,
@@ -1868,6 +1884,11 @@ class SlurmLauncher(BatchSystemLauncher):
18681884
delete_command = List(
18691885
['scancel'], config=True, help="The slurm delete command ['scancel']"
18701886
)
1887+
signal_command = List(
1888+
['scancel', '-s'],
1889+
config=True,
1890+
help="The slurm signal command ['scancel', '-s']",
1891+
)
18711892
job_id_regexp = CRegExp(
18721893
r'\d+',
18731894
config=True,
@@ -2023,9 +2044,12 @@ class SGEEngineSetLauncher(SGELauncher, BatchEngineSetLauncher):
20232044
class LSFLauncher(BatchSystemLauncher):
20242045
"""A BatchSystemLauncher subclass for LSF."""
20252046

2026-
submit_command = List(['bsub'], config=True, help="The PBS submit command ['bsub']")
2047+
submit_command = List(['bsub'], config=True, help="The LSF submit command ['bsub']")
20272048
delete_command = List(
2028-
['bkill'], config=True, help="The PBS delete command ['bkill']"
2049+
['bkill'], config=True, help="The LSF delete command ['bkill']"
2050+
)
2051+
signal_command = List(
2052+
['bkill', '-s'], config=True, help="The LSF signal command ['bkill', '-s']"
20292053
)
20302054
job_id_regexp = CRegExp(
20312055
r'\d+',

ipyparallel/tests/test_cluster.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212
from ipyparallel import cluster
1313
from ipyparallel.cluster.launcher import find_launcher_class
1414

15-
_engine_launcher_classes = ["Local"]
16-
if shutil.which("mpiexec"):
17-
_engine_launcher_classes.append("MPI")
18-
1915
_timeout = 30
2016

2117

@@ -58,9 +54,9 @@ async def test_ipython_log(ipython):
5854
assert c.log.handlers[0].stream is sys.stdout
5955

6056

61-
@pytest.fixture(params=_engine_launcher_classes)
62-
def engine_launcher_class(request):
63-
return request.param
57+
@pytest.fixture
58+
def engine_launcher_class():
59+
return 'Local'
6460

6561

6662
async def test_start_stop_controller(Cluster):

ipyparallel/tests/test_mpi.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import shutil
2+
3+
import pytest
4+
5+
from .test_cluster import test_restart_engines # noqa: F401
6+
from .test_cluster import test_signal_engines # noqa: F401
7+
from .test_cluster import test_start_stop_cluster # noqa: F401
8+
from .test_cluster import test_to_from_dict # noqa: F401
9+
10+
# the tests imported above all use the engine_launcher_class fixture
11+
12+
# override engine_launcher_class
13+
@pytest.fixture
14+
def engine_launcher_class():
15+
if shutil.which("mpiexec") is None:
16+
pytest.skip("Requires mpiexec")
17+
return 'MPI'

ipyparallel/tests/test_slurm.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import shutil
2+
3+
import pytest
4+
5+
from .test_cluster import test_restart_engines # noqa: F401
6+
from .test_cluster import test_signal_engines # noqa: F401
7+
from .test_cluster import test_start_stop_cluster # noqa: F401
8+
from .test_cluster import test_to_from_dict # noqa: F401
9+
10+
# the tests imported above all use the engine_launcher_class fixture
11+
12+
# override engine_launcher_class
13+
@pytest.fixture
14+
def engine_launcher_class():
15+
if shutil.which("sbatch") is None:
16+
pytest.skip("Requires slurm")
17+
return 'Slurm'

0 commit comments

Comments
 (0)