
Commit 83e37b4

Merge pull request #516 from minrk/slurm: Slurm test coverage
2 parents 1568745 + 07ceabd

File tree: 6 files changed, +170 / -42 lines

  .github/workflows/test.yml
  ci/slurm/slurm.conf
  ipyparallel/cluster/launcher.py
  ipyparallel/tests/test_cluster.py
  ipyparallel/tests/test_mpi.py
  ipyparallel/tests/test_slurm.py

.github/workflows/test.yml

Lines changed: 36 additions & 16 deletions
@@ -19,15 +19,17 @@ jobs:
       matrix:
         include:
           - python: "3.9"
-            ssh: ssh
+            cluster_type: ssh
+          - python: "3.8"
+            cluster_type: mpi
+          - python: "3.7"
+            cluster_type: slurm
           - python: "3.6"
             tornado: "5.1.1"
           - python: "3.7"
             controller_ip: "*"
           - python: "3.8"
             runs_on: windows-2019
-          - python: "3.8"
-            mpi: mpi
           - python: "3.9"
             runs_on: macos-10.15

@@ -47,15 +49,27 @@ jobs:
           echo "IPP_CONTROLLER_IP=${{ matrix.controller_ip }}" >> $GITHUB_ENV

       - name: Set up docker-compose for ssh launcher
-        if: ${{ matrix.ssh }}
+        if: ${{ matrix.cluster_type == 'ssh' }}
         run: |
           export DOCKER_BUILDKIT=1
           export COMPOSE_DOCKER_CLI_BUILD=1
           cd ci/ssh
           docker-compose up -d --build

+      - name: Set up slurm
+        if: ${{ matrix.cluster_type == 'slurm' }}
+        run: |
+          sudo rm -rf /var/lib/apt/lists
+          sudo apt-get update && sudo apt-get -f -y install && sudo apt-get -y install slurm-wlm
+          sudo cp ci/slurm/slurm.conf /etc/slurm-llnl/
+          sudo mkdir /var/spool/slurmctl
+          sudo mkdir /var/spool/slurmd
+          sudo service munge start
+          sudo service slurmd start
+          sudo service slurmctld start
+
       - name: Install Python (conda) ${{ matrix.python }}
-        if: matrix.mpi
+        if: ${{ matrix.cluster_type == 'mpi' }}
         run: |
           export MAMBA_ROOT_PREFIX=$HOME/conda
           test -d $MAMBA_ROOT_PREFIX || mkdir $MAMBA_ROOT_PREFIX
@@ -66,7 +80,7 @@ jobs:
           echo "PATH=$MAMBA_ROOT_PREFIX/bin:$PATH" >> $GITHUB_ENV

       - name: Install Python ${{ matrix.python }}
-        if: ${{ ! matrix.mpi }}
+        if: ${{ matrix.cluster_type != 'mpi' }}
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python }}
@@ -85,23 +99,29 @@ jobs:
       - name: Show environment
         run: pip freeze

-      - name: Run mpi tests
-        if: ${{ matrix.mpi }}
+      - name: Run ${{ matrix.cluster_type }} tests
+        if: ${{ matrix.cluster_type }}
         run: |
-          pytest -vx --color=yes --cov=ipyparallel ipyparallel/tests/test_cluster.py
-
-      - name: Run ssh tests
-        if: ${{ matrix.ssh }}
-        run: |
-          pytest -ra -v --maxfail=2 --color=yes --cov=ipyparallel ipyparallel/tests/test_ssh.py
+          pytest -ra -v --maxfail=2 --color=yes --cov=ipyparallel ipyparallel/tests/test_${{ matrix.cluster_type }}.py

       - name: Run tests
-        if: ${{ ! matrix.mpi && ! matrix.ssh }}
+        if: ${{ ! matrix.cluster_type }}
         # FIXME: --color=yes explicitly set because:
         # https://github.com/actions/runner/issues/241
         run: |
-          pytest -v --maxfail=3 --color=yes --cov=ipyparallel ipyparallel/tests
+          pytest -ra -v --maxfail=3 --color=yes --cov=ipyparallel ipyparallel/tests

       - name: Submit codecov report
         run: |
           codecov
+
+      - name: Report on slurm
+        if: ${{ matrix.cluster_type == 'slurm' && failure() }}
+        run: |
+          set -x
+          slurmd -C
+          ls -l
+          squeue
+          sinfo
+          scontrol show node=localhost
+          sudo cat /var/log/slurm-llnl/*
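
With the matrix key renamed to cluster_type, a single step now selects the test module per variant: pytest runs ipyparallel/tests/test_${{ matrix.cluster_type }}.py. Below is a rough local equivalent of that step, sketched in Python; it assumes pytest and pytest-cov are installed, the chosen backend (slurm, mpi, or ssh) is actually set up, and it is run from the repository root.

# Sketch only: mirrors the "Run <cluster_type> tests" workflow step locally.
# Assumes pytest and pytest-cov are installed and the matching backend is available.
import pytest

cluster_type = "slurm"  # or "mpi" / "ssh", matching matrix.cluster_type
exit_code = pytest.main(
    [
        "-ra",
        "-v",
        "--maxfail=2",
        "--color=yes",
        "--cov=ipyparallel",
        f"ipyparallel/tests/test_{cluster_type}.py",
    ]
)
print("pytest exit code:", exit_code)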

ci/slurm/slurm.conf

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+# slurm.conf file generated by configurator easy.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+SlurmctldHost=localhost
+#
+#MailProg=/bin/mail
+MpiDefault=none
+#MpiParams=ports=#-#
+ProctrackType=proctrack/linuxproc
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+#SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.pid
+#SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd
+SlurmUser=root
+#SlurmdUser=root
+StateSaveLocation=/var/spool/slurmctl
+SwitchType=switch/none
+#TaskPlugin=task/affinity
+#CoreSpecPlugin=core_spec/none
+
+#
+# TIMERS
+#KillWait=30
+#MinJobAge=300
+#SlurmctldTimeout=120
+#SlurmdTimeout=300
+#
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+SelectTypeParameters=CR_Core
+#
+#
+# LOGGING AND ACCOUNTING
+AccountingStorageType=accounting_storage/none
+ClusterName=cluster
+#JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+SlurmctldDebug=debug5
+SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
+SlurmdDebug=debug5
+SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
+#
+#
+# COMPUTE NODES
+# Note: CPUs apparently cannot be oversubscribed
+# this can only run where at least 2 CPUs are available
+NodeName=localhost NodeHostName=localhost NodeAddr=127.0.0.1 CPUs=2 State=UNKNOWN
+PartitionName=part Nodes=localhost Default=YES MaxTime=INFINITE State=UP OverSubscribe=YES
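
The node definition pins CPUs=2, and the comment above it notes the Slurm-backed tests only work with at least two CPUs. A purely illustrative guard, not part of this commit, that a test module could use to skip on smaller machines:

# Hypothetical helper, not in the commit: skip when the host has fewer CPUs
# than the CPUs=2 node definition in ci/slurm/slurm.conf assumes.
import os

import pytest


def require_two_cpus():
    if (os.cpu_count() or 1) < 2:
        pytest.skip("ci/slurm/slurm.conf defines a node with CPUs=2")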

ipyparallel/cluster/launcher.py

Lines changed: 40 additions & 14 deletions
@@ -1586,6 +1586,12 @@ def _cluster_id_default(self):
         help="The name of the command line program used to delete jobs.",
     )

+    signal_command = List(
+        [''],
+        config=True,
+        help="The name of the command line program used to send signals to jobs.",
+    )
+
     job_id = Unicode().tag(to_dict=True)

     job_id_regexp = CRegExp(
@@ -1616,7 +1622,7 @@ def _cluster_id_default(self):
     def _queue_changed(self, change):
         self._update_context(change)

-    n = Integer(1)
+    n = Integer(1).tag(to_dict=True)

     @observe('n')
     def _n_changed(self, change):
@@ -1643,7 +1649,7 @@ def _n_changed(self, change):
         This lets you parameterize additional options,
         such as wall_time with a custom template.
         """,
-    )
+    ).tag(to_dict=True)

     @default("context")
     def _context_default(self):
@@ -1714,7 +1720,7 @@ def write_batch_script(self, n=1):
         # from user config
         ns.update(self.namespace)
         script_as_string = self.formatter.format(self.batch_template, **ns)
-        self.log.debug('Writing batch script: %s', self.batch_file)
+        self.log.debug(f'Writing batch script: {self.batch_file}\n{script_as_string}')
         with open(self.batch_file, 'w') as f:
             f.write(script_as_string)
         os.chmod(self.batch_file, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
@@ -1739,35 +1745,44 @@ def start(self, n=1):
         # Here we save profile_dir in the context so they
         # can be used in the batch script template as {profile_dir}
         self.write_batch_script(n)
+
         output = check_output(self.args, env=os.environ)
         output = output.decode(DEFAULT_ENCODING, 'replace')
+        self.log.debug(f"Submitted {shlex_join(self.args)}. Output: {output}")

         job_id = self.parse_job_id(output)
         self.notify_start(job_id)
         return job_id

     def stop(self):
         try:
-            p = Popen(
+            output = check_output(
                 self.delete_command + [self.job_id],
-                env=os.environ,
-                stdout=PIPE,
-                stderr=PIPE,
-            )
-            out, err = p.communicate()
-            output = out + err
-        except:
+                stdin=None,
+            ).decode(DEFAULT_ENCODING, 'replace')
+        except Exception:
             self.log.exception(
                 "Problem stopping cluster with command: %s"
                 % (self.delete_command + [self.job_id])
             )
             output = ""
-        output = output.decode(DEFAULT_ENCODING, 'replace')
+
         self.notify_stop(
             dict(job_id=self.job_id, output=output)
         )  # Pass the output of the kill cmd
         return output

+    def signal(self, sig):
+        cmd = self.signal_command + [str(sig), self.job_id]
+        try:
+            output = check_output(
+                cmd,
+                stdin=None,
+            ).decode(DEFAULT_ENCODING, 'replace')
+        except Exception:
+            self.log.exception("Problem sending signal with: {shlex_join(cmd)}")
+            output = ""
+

 class BatchControllerLauncher(BatchSystemLauncher, ControllerLauncher):
     @default("program")
@@ -1813,6 +1828,9 @@ class PBSLauncher(BatchSystemLauncher):

     submit_command = List(['qsub'], config=True, help="The PBS submit command ['qsub']")
     delete_command = List(['qdel'], config=True, help="The PBS delete command ['qdel']")
+    signal_command = List(
+        ['qsig', '-s'], config=True, help="The PBS signal command ['qsig']"
+    )
     job_id_regexp = CRegExp(
         r'\d+',
         config=True,
@@ -1868,6 +1886,11 @@ class SlurmLauncher(BatchSystemLauncher):
     delete_command = List(
         ['scancel'], config=True, help="The slurm delete command ['scancel']"
     )
+    signal_command = List(
+        ['scancel', '-s'],
+        config=True,
+        help="The slurm signal command ['scancel', '-s']",
+    )
     job_id_regexp = CRegExp(
         r'\d+',
         config=True,
@@ -2023,9 +2046,12 @@ class SGEEngineSetLauncher(SGELauncher, BatchEngineSetLauncher):
 class LSFLauncher(BatchSystemLauncher):
     """A BatchSystemLauncher subclass for LSF."""

-    submit_command = List(['bsub'], config=True, help="The PBS submit command ['bsub']")
+    submit_command = List(['bsub'], config=True, help="The LSF submit command ['bsub']")
     delete_command = List(
-        ['bkill'], config=True, help="The PBS delete command ['bkill']"
+        ['bkill'], config=True, help="The LSF delete command ['bkill']"
+    )
+    signal_command = List(
+        ['bkill', '-s'], config=True, help="The LSF signal command ['bkill', '-s']"
     )
     job_id_regexp = CRegExp(
         r'\d+',
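
The new signal() method on BatchSystemLauncher appends the signal number and the job id to signal_command, so the Slurm default ['scancel', '-s'] turns into scancel -s <sig> <job_id>. A minimal sketch of the argv it assembles, using the defaults added above and a hypothetical job id; nothing is executed here.

# Sketch only: the command list BatchSystemLauncher.signal() builds for Slurm,
# per the signal_command default added in this diff. The job id is hypothetical.
import signal

signal_command = ['scancel', '-s']  # SlurmLauncher.signal_command default
job_id = "12345"                    # hypothetical; normally parsed from sbatch output
sig = int(signal.SIGTERM)

cmd = signal_command + [str(sig), job_id]
print(cmd)  # e.g. ['scancel', '-s', '15', '12345'] on Linux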

ipyparallel/tests/test_cluster.py

Lines changed: 7 additions & 12 deletions
@@ -1,6 +1,5 @@
 import asyncio
 import os
-import shutil
 import signal
 import sys
 import time
@@ -12,10 +11,6 @@
 from ipyparallel import cluster
 from ipyparallel.cluster.launcher import find_launcher_class

-_engine_launcher_classes = ["Local"]
-if shutil.which("mpiexec"):
-    _engine_launcher_classes.append("MPI")
-
 _timeout = 30


@@ -58,9 +53,9 @@ async def test_ipython_log(ipython):
     assert c.log.handlers[0].stream is sys.stdout


-@pytest.fixture(params=_engine_launcher_classes)
-def engine_launcher_class(request):
-    return request.param
+@pytest.fixture
+def engine_launcher_class():
+    return 'Local'


 async def test_start_stop_controller(Cluster):
@@ -88,7 +83,7 @@ async def test_start_stop_engines(Cluster, engine_launcher_class):
     cluster = Cluster(engine_launcher_class=engine_launcher_class)
     await cluster.start_controller()

-    n = 3
+    n = 2
     engine_set_id = await cluster.start_engines(n)
     assert engine_set_id in cluster._engine_sets
     engine_set = cluster._engine_sets[engine_set_id]
@@ -127,10 +122,10 @@ async def test_start_stop_cluster(Cluster, engine_launcher_class):
 async def test_signal_engines(request, Cluster, engine_launcher_class):
     cluster = Cluster(engine_launcher_class=engine_launcher_class)
     await cluster.start_controller()
-    engine_set_id = await cluster.start_engines(n=3)
+    engine_set_id = await cluster.start_engines(n=2)
     rc = await cluster.connect_client()
     request.addfinalizer(rc.close)
-    rc.wait_for_engines(3)
+    rc.wait_for_engines(2)
     # seems to be a problem if we start too soon...
     await asyncio.sleep(1)
     # ensure responsive
@@ -157,7 +152,7 @@ async def test_signal_engines(request, Cluster, engine_launcher_class):


 async def test_restart_engines(Cluster, engine_launcher_class):
-    n = 3
+    n = 2
     async with Cluster(engine_launcher_class=engine_launcher_class, n=n) as rc:
         cluster = rc.cluster
         engine_set_id = next(iter(cluster._engine_sets))

ipyparallel/tests/test_mpi.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+import shutil
+
+import pytest
+
+from .test_cluster import test_restart_engines  # noqa: F401
+from .test_cluster import test_signal_engines  # noqa: F401
+from .test_cluster import test_start_stop_cluster  # noqa: F401
+from .test_cluster import test_to_from_dict  # noqa: F401
+
+# import tests that use engine_launcher_class fixture
+
+# override engine_launcher_class
+@pytest.fixture
+def engine_launcher_class():
+    if shutil.which("mpiexec") is None:
+        pytest.skip("Requires mpiexec")
+    return 'MPI'

ipyparallel/tests/test_slurm.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+import shutil
+
+import pytest
+
+from .test_cluster import test_restart_engines  # noqa: F401
+from .test_cluster import test_signal_engines  # noqa: F401
+from .test_cluster import test_start_stop_cluster  # noqa: F401
+from .test_cluster import test_to_from_dict  # noqa: F401
+
+# import tests that use engine_launcher_class fixture
+
+# override engine_launcher_class
+@pytest.fixture
+def engine_launcher_class():
+    if shutil.which("sbatch") is None:
+        pytest.skip("Requires slurm")
+    return 'Slurm'
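
test_mpi.py and test_slurm.py share one pattern: import the cluster tests that consume the engine_launcher_class fixture, then override that fixture with a skip guard for the required binary. A hypothetical module for another batch launcher could follow the same shape; nothing like this exists in the commit, and 'qsub'/'PBS' here are only illustrative.

# Hypothetical test_pbs.py following the same import-and-override pattern;
# not part of this commit.
import shutil

import pytest

from .test_cluster import test_start_stop_cluster  # noqa: F401


# override engine_launcher_class, as in test_mpi.py / test_slurm.py
@pytest.fixture
def engine_launcher_class():
    if shutil.which("qsub") is None:
        pytest.skip("Requires PBS")
    return 'PBS'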
