33#
44# SPDX-License-Identifier: BSD-3-Clause
55
6+ import contextlib
67import errno
78import os
89import signal
@@ -67,6 +68,7 @@ def submit(self, job):
6768 stderr = f_stderr ,
6869 start_new_session = True
6970 )
71+ self .log (f'spawned local process: { proc .pid } ' )
7072
7173 # Update job info
7274 job ._jobid = proc .pid
@@ -92,10 +94,11 @@ def filternodes(self, job, nodes):
9294 return [sched .AlwaysIdleNode (socket .gethostname ())]
9395
9496 def _kill_all (self , job ):
95- '''Send SIGKILL to all the processes of the spawned job.'''
97+ '''Send SIGKILL to all the processes of the spawned job and wait for
98+ any children to finish.'''
9699 try :
100+ self .log (f'sending SIGKILL to process group { job ._jobid } ' )
97101 os .killpg (job ._jobid , signal .SIGKILL )
98- job ._signal = signal .SIGKILL
99102 except (ProcessLookupError , PermissionError ):
100103 # The process group may already be dead or assigned to a different
101104 # group, so ignore this error
@@ -104,19 +107,19 @@ def _kill_all(self, job):
104107 # Close file handles
105108 job .f_stdout .close ()
106109 job .f_stderr .close ()
107- job ._state = 'FAILURE'
110+ with contextlib .suppress (ChildProcessError ):
111+ os .waitpid (0 , 0 )
108112
109113 def _term_all (self , job ):
110114 '''Send SIGTERM to all the processes of the spawned job.'''
111115 try :
116+ self .log (f'sending SIGTERM to process group { job ._jobid } ' )
112117 os .killpg (job ._jobid , signal .SIGTERM )
113- job ._signal = signal .SIGTERM
114118 except (ProcessLookupError , PermissionError ):
115119 # Job has finished already, close file handles
116120 self .log (f'pid { job .jobid } already dead' )
117121 job .f_stdout .close ()
118122 job .f_stderr .close ()
119- job ._state = 'FAILURE'
120123
121124 def cancel (self , job ):
122125 '''Cancel job.
@@ -126,6 +129,7 @@ def cancel(self, job):
126129
127130 This function waits for the spawned process tree to finish.
128131 '''
132+ self .log (f'cancelling job { job ._jobid } ' )
129133 self ._term_all (job )
130134 job ._cancel_time = time .time ()
131135
@@ -150,10 +154,7 @@ def finished(self, job):
150154 the process has finished, you *must* call wait() to properly cleanup
151155 after it.
152156 '''
153- if job .exception :
154- raise job .exception
155-
156- return job .state in ['SUCCESS' , 'FAILURE' , 'TIMEOUT' ]
157+ return job .exitcode is not None or job .signal is not None
157158
158159 def poll (self , * jobs ):
159160 for job in jobs :
@@ -173,37 +174,45 @@ def _poll_job(self, job):
173174 else :
174175 raise e
175176
176- if job .cancel_time :
177- # Job has been cancelled; give it a grace period and kill it
178- self .log (f'Job { job .jobid } has been cancelled; '
179- f'giving it a grace period' )
180- t_rem = self .CANCEL_GRACE_PERIOD - (time .time () - job .cancel_time )
181- if t_rem > 0 :
182- time .sleep (t_rem )
177+ if pid :
178+ # Job has finished
179+ self .log (f'spawned process { job ._jobid } has finished' )
183180
181+ # Forcefully kill the whole session once the parent process exits
184182 self ._kill_all (job )
185- return
186183
187- if not pid :
188- # Job has not finished; check if we have reached a timeout
189- t_elapsed = time .time () - job .submit_time
190- if job .time_limit and t_elapsed > job .time_limit :
191- self ._kill_all (job )
184+ # Call wait() on the underlying Popen object to avoid false-positive
185+ # warnings
186+ job ._proc .wait ()
187+
188+ # Retrieve the status of the job and return
189+ if os .WIFEXITED (status ):
190+ job ._exitcode = os .WEXITSTATUS (status )
191+ if job ._state == 'RUNNING' :
192+ job ._state = 'FAILURE' if job ._exitcode != 0 else 'SUCCESS'
193+ elif os .WIFSIGNALED (status ):
194+ if job ._state == 'RUNNING' :
195+ job ._state = 'FAILURE'
196+
197+ job ._signal = os .WTERMSIG (status )
198+ self .log (f'job killed by signal: { job ._signal } ' )
199+
200+ self .log (f'job state: { job ._state } ' )
201+ else :
202+ # Job has not finished; check for timeouts
203+ now = time .time ()
204+ t_elapsed = now - job .submit_time
205+ if job .cancel_time :
206+ t_rem = self .CANCEL_GRACE_PERIOD - (now - job .cancel_time )
207+ self .log (f'job { job .jobid } has been cancelled; '
208+ f'giving it a grace period of { t_rem } seconds' )
209+ if t_rem <= 0 :
210+ self ._kill_all (job )
211+ elif job .time_limit and t_elapsed > job .time_limit :
212+ self .log (f'job { job ._jobid } timed out; cancelling it' )
213+ self .cancel (job )
192214 job ._state = 'TIMEOUT'
193215 job ._exception = JobError (
194216 f'job timed out ({ t_elapsed :.6f} s > { job .time_limit } s)' ,
195217 job .jobid
196218 )
197-
198- return
199-
200- # Job has finished; kill the whole session
201- self ._kill_all (job )
202-
203- # Retrieve the status of the job and return
204- if os .WIFEXITED (status ):
205- job ._exitcode = os .WEXITSTATUS (status )
206- job ._state = 'FAILURE' if job .exitcode != 0 else 'SUCCESS'
207- elif os .WIFSIGNALED (status ):
208- job ._state = 'FAILURE'
209- job ._signal = os .WTERMSIG (status )
0 commit comments