Skip to content

Commit 79309dc

Browse files
authored
Merge pull request #414 from ExaWorks/hostlist
Hostlist
2 parents d9d2370 + c952604 commit 79309dc

File tree

13 files changed

+145
-12
lines changed

13 files changed

+145
-12
lines changed

src/psij/executors/batch/batch_scheduler_executor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ class BatchSchedulerExecutor(JobExecutor):
154154
2. store the exit code of the launch command in the *exit code file* named
155155
`<native_id>.ec`, also inside `<script_dir>`.
156156
157+
Additionally, where appropriate, the submit script should set the environment variable named
158+
``PSIJ_NODEFILE`` to point to a file containing a list of nodes that are allocated for the job,
159+
one per line, with a total number of lines matching the process count of the job.
160+
157161
Once the submit script is generated, the executor renders the submit command using
158162
:func:`~get_submit_command` and executes it. Its output is then parsed using
159163
:func:`~job_id_from_submit_output` to retrieve the `native_id` of the job. Subsequently, the

src/psij/executors/batch/cobalt/cobalt.mustache

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ only results in empty files that are not cleaned up}}
4646
#COBALT -e /dev/null
4747
#COBALT -o /dev/null
4848

49+
{{!like PBS, this is also cheap and there is no need to check the setting}}
50+
PSIJ_NODEFILE="$COBALT_NODEFILE"
51+
export PSIJ_NODEFILE
4952

5053
{{!redirect output here instead of through #COBALT directive since COBALT_JOB_ID is not available
5154
when the directives are evaluated; the reason for using the job id in the first place being the

src/psij/executors/batch/lsf/lsf.mustache

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ only results in empty files that are not cleaned up}}
7171
#BSUB -e /dev/null
7272
#BSUB -o /dev/null
7373

74+
{{!LSB_HOSTS holds the allocated host names themselves, not a path; LSB_DJOB_HOSTFILE is the path
of the host file that LSF writes for the job, which is what PSIJ_NODEFILE must point to}}
PSIJ_NODEFILE="$LSB_DJOB_HOSTFILE"
export PSIJ_NODEFILE
76+
7477
{{!redirect output here instead of through #BSUB directive since LSB_JOBID is not available
7578
when the directives are evaluated; the reason for using the job id in the first place being the
7679
same as for the exit code file.}}

src/psij/executors/batch/pbspro/pbspro.mustache

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ only results in empty files that are not cleaned up}}
4848
#PBS -v {{name}}={{value}}
4949
{{/env}}
5050

51+
PSIJ_NODEFILE="$PBS_NODEFILE"
52+
export PSIJ_NODEFILE
53+
5154

5255
{{#job.spec.directory}}
5356
cd "{{.}}"

src/psij/executors/batch/slurm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,7 @@ def _format_duration(self, d: timedelta) -> str:
185185
if d.days > 0:
186186
days = str(d.days) + '-'
187187
return days + "%s:%s:%s" % (d.seconds // 3600, (d.seconds // 60) % 60, d.seconds % 60)
188+
189+
def _clean_submit_script(self, job: Job) -> None:
    """Run the inherited cleanup, then delete the job's ``.nodefile`` auxiliary file.

    :param job: The job whose submit script and auxiliary files are being cleaned up.
    """
    super()._clean_submit_script(job)
    # NOTE(review): presumably this is the node list that the Slurm submit script writes next
    # to the other auxiliary files -- confirm against `_delete_aux_file`.
    self._delete_aux_file(job, '.nodefile')

src/psij/executors/batch/slurm/slurm.mustache

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ only results in empty files that are not cleaned up}}
7474
#SBATCH -e /dev/null
7575
#SBATCH -o /dev/null
7676

77+
{{#env}}
#SBATCH --export={{name}}={{value}}
{{/env}}

{{!sbatch stops processing #SBATCH directives at the first executable line, so the node file
commands must come after the directives above; any further #SBATCH directives must also stay
above this point}}
PSIJ_NODEFILE="{{psij.script_dir}}/$SLURM_JOB_ID.nodefile"
scontrol show hostnames >"$PSIJ_NODEFILE"
export PSIJ_NODEFILE


src/psij/executors/local.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,35 @@
11
"""This module contains the local :class:`~psij.JobExecutor`."""
22
import logging
33
import os
4+
import shlex
45
import signal
56
import subprocess
67
import threading
78
import time
89
from abc import ABC, abstractmethod
10+
from tempfile import mkstemp
911
from types import FrameType
1012
from typing import Optional, Dict, List, Tuple, Type, cast
1113

1214
import psutil
1315

14-
from psij import InvalidJobException, SubmitException, Launcher
16+
from psij import InvalidJobException, SubmitException, Launcher, ResourceSpecV1
1517
from psij import Job, JobSpec, JobExecutorConfig, JobState, JobStatus
1618
from psij import JobExecutor
1719
from psij.utils import SingletonThread
1820

1921
logger = logging.getLogger(__name__)
2022

2123

24+
def _format_shell_cmd(args: List[str]) -> str:
    """Format an argument list so that it can be pasted into a shell.

    Every argument is quoted with :func:`shlex.quote` and the quoted arguments are joined
    with single spaces.

    :param args: The command followed by its arguments.
    :return: The quoted command line, without trailing whitespace; an empty string if `args`
        is empty.
    """
    return ' '.join(shlex.quote(arg) for arg in args)
31+
32+
2233
def _handle_sigchld(signum: int, frame: Optional[FrameType]) -> None:
    """Forward a signal notification to the :class:`_ProcessReaper` singleton.

    The signature matches that of a handler installed with :func:`signal.signal`. Neither
    argument is used.

    :param signum: The number of the delivered signal.
    :param frame: The stack frame that was interrupted by the signal, if any.
    """
    _ProcessReaper.get_instance()._handle_sigchld()
2435

@@ -67,6 +78,7 @@ class _ChildProcessEntry(_ProcessEntry):
6778
def __init__(self, job: Job, executor: 'LocalJobExecutor',
             launcher: Optional[Launcher]) -> None:
    """Initialize the entry; all arguments are passed unchanged to the parent constructor.

    :param job: The job this entry belongs to.
    :param executor: The executor that owns the job.
    :param launcher: The launcher for the job, if any.
    """
    super().__init__(job, executor, launcher)
    # Path of the node file created for this process, or None when no node file exists.
    self.nodefile: Optional[str] = None
7082

7183
def kill(self) -> None:
7284
super().kill()
@@ -75,6 +87,8 @@ def poll(self) -> Tuple[Optional[int], Optional[str]]:
7587
assert self.process is not None
7688
exit_code = self.process.poll()
7789
if exit_code is not None:
90+
if self.nodefile:
91+
os.unlink(self.nodefile)
7892
if self.process.stdout:
7993
return exit_code, self.process.stdout.read().decode('utf-8')
8094
else:
@@ -103,19 +117,30 @@ def poll(self) -> Tuple[Optional[int], Optional[str]]:
103117
return None, None
104118

105119

106-
def _get_env(spec: JobSpec) -> Optional[Dict[str, str]]:
120+
def _get_env(spec: JobSpec, nodefile: Optional[str]) -> Optional[Dict[str, str]]:
    """Build the environment passed to :class:`subprocess.Popen` for a job.

    The executor-provided ``PSIJ_NODEFILE`` always takes precedence over an entry with the
    same name in ``spec.environment``, whether or not the parent environment is inherited.

    :param spec: The job specification; its `inherit_environment` and `environment`
        attributes are read.
    :param nodefile: Path of the node file to expose as ``PSIJ_NODEFILE``, or `None` if the
        job has no node file.
    :return: The environment mapping, or `None` to make `Popen` inherit the environment of
        the current process unchanged.
    """
    if spec.inherit_environment:
        if spec.environment is None and nodefile is None:
            # if env is none in Popen, it inherits env from parent
            return None
        # merge current env with spec env; work on a copy so os.environ is not modified
        env = os.environ.copy()
    else:
        if nodefile is None:
            # only spec env
            # NOTE(review): if spec.environment is None, Popen receives None and inherits the
            # parent environment even though inherit_environment is false -- confirm intent.
            return spec.environment
        env = {}

    if spec.environment:
        env.update(spec.environment)
    if nodefile is not None:
        # set last so that the executor-provided value wins over the spec environment
        env['PSIJ_NODEFILE'] = nodefile
    return env
119144

120145

121146
class _ProcessReaper(SingletonThread):
@@ -222,6 +247,26 @@ def __init__(self, url: Optional[str] = None,
222247
super().__init__(url=url, config=config if config else JobExecutorConfig())
223248
self._reaper = _ProcessReaper.get_instance()
224249

250+
def _generate_nodefile(self, job: Job, p: _ChildProcessEntry) -> Optional[str]:
    """Create the node file for a local job, if the job needs one.

    The file contains one ``localhost`` line per process of the job. No file is created when
    the job has no resource specification or when it runs a single process.

    :param job: The job being submitted; its `spec` must be set.
    :param p: The process entry of the job. Its `nodefile` attribute is set to the path of
        the generated file so that the file can be removed later.
    :return: The path of the generated node file, or `None` if no file was generated.
    :raises SubmitException: If the resource specification has an unsupported version.
    """
    assert job.spec is not None
    resources = job.spec.resources
    if resources is None:
        return None
    if resources.version != 1:
        raise SubmitException('Cannot handle resource specification with version %s'
                              % resources.version)
    assert isinstance(resources, ResourceSpecV1)
    n = resources.computed_process_count
    if n == 1:
        # as a bit of an optimization, we don't generate a nodefile when doing "single
        # node" jobs on local.
        return None

    fd, path = mkstemp(suffix='.nodelist')
    try:
        # The context manager closes the descriptor even if the write fails; binary mode
        # keeps the line endings exactly as written.
        with os.fdopen(fd, 'wb') as f:
            f.write(b'localhost\n' * n)
    except OSError:
        # do not leave a partially written temporary file behind
        os.unlink(path)
        raise
    p.nodefile = path
    return path
269+
225270
def submit(self, job: Job) -> None:
226271
"""
227272
Submits the specified :class:`~psij.Job` to be run locally.
@@ -244,9 +289,12 @@ def submit(self, job: Job) -> None:
244289
with job._status_cv:
245290
if job.status.state == JobState.CANCELED:
246291
raise SubmitException('Job canceled')
247-
logger.debug('Running %s, out=%s, err=%s', args, spec.stdout_path, spec.stderr_path)
292+
if logger.isEnabledFor(logging.DEBUG):
293+
logger.debug('Running %s', _format_shell_cmd(args))
294+
nodefile = self._generate_nodefile(job, p)
295+
env = _get_env(spec, nodefile)
248296
p.process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
249-
close_fds=True, cwd=spec.directory, env=_get_env(spec))
297+
close_fds=True, cwd=spec.directory, env=env)
250298
self._reaper.register(p)
251299
job._native_id = p.process.pid
252300
self._set_job_status(job, JobStatus(JobState.QUEUED, time=time.time(),

src/psij/launchers/script_based_launcher.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,9 @@ def get_additional_args(self, job: Job) -> List[str]:
202202

203203
def is_launcher_failure(self, output: str) -> bool:
    """See :func:`~psij.job_launcher.Launcher.is_launcher_failure`."""
    # The marker is expected on the line before the last one, the last line being empty.
    # Output with fewer than two lines (e.g., empty output) cannot hold the marker in that
    # position, so it is reported as a failure rather than raising an IndexError.
    lines = output.split('\n')
    return len(lines) < 2 or lines[-2] != '_PSI_J_LAUNCHER_DONE'
206207

207208
def get_launcher_failure_message(self, output: str) -> str:
    """See :func:`~psij.job_launcher.Launcher.get_launcher_failure_message`."""
    # discard the last two elements of the newline-split output and re-join the remainder
    lines = output.split('\n')
    del lines[-2:]
    return '\n'.join(lines)

tests/_test_tools.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ def assert_completed(job: Job, status: Optional[JobStatus]) -> None:
3434
assert job.spec is not None
3535
stdout = _read_file(job.spec.stdout_path)
3636
stderr = _read_file(job.spec.stderr_path)
37-
raise AssertionError('Job not completed. Status message: %s, stdout: %s, stderr: %s'
38-
% (status.message, stdout, stderr))
37+
raise AssertionError('Job not completed. Exit code: %s, Status message: %s, '
38+
'stdout: %s, stderr: %s'
39+
% (status.exit_code, status.message, stdout, stderr))
3940

4041

4142
def _get_executor_instance(ep: ExecutorTestParams, job: Optional[Job] = None) -> JobExecutor:

tests/plugins1/_batch_test/_batch_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ def _get_state(self, state: str) -> JobState:
9191
assert state in _TestJobExecutor._STATE_MAP
9292
return _TestJobExecutor._STATE_MAP[state]
9393

94+
def _clean_submit_script(self, job: Job) -> None:
    """Run the inherited cleanup, then delete the job's ``.nodefile`` auxiliary file.

    :param job: The job whose submit script and auxiliary files are being cleaned up.
    """
    super()._clean_submit_script(job)
    self._delete_aux_file(job, '.nodefile')
97+
9498

9599
class _TestLauncher(MultipleLauncher):
96100
def __init__(self, config: Optional[JobExecutorConfig] = None):

0 commit comments

Comments
 (0)