@@ -68,6 +68,7 @@ def submit(self, job):
6868 stderr = f_stderr ,
6969 start_new_session = True
7070 )
71+ self .log (f'spawned local process: { proc .pid } ' )
7172
7273 # Update job info
7374 job ._jobid = proc .pid
@@ -96,6 +97,7 @@ def _kill_all(self, job):
9697 '''Send SIGKILL to all the processes of the spawned job and wait for
9798 any children to finish'''
9899 try :
100+ self .log (f'sending SIGKILL to process group { job ._jobid } ' )
99101 os .killpg (job ._jobid , signal .SIGKILL )
100102 except (ProcessLookupError , PermissionError ):
101103 # The process group may already be dead or assigned to a different
@@ -111,6 +113,7 @@ def _kill_all(self, job):
111113 def _term_all (self , job ):
112114 '''Send SIGTERM to all the processes of the spawned job.'''
113115 try :
116+ self .log (f'sending SIGTERM to process group { job ._jobid } ' )
114117 os .killpg (job ._jobid , signal .SIGTERM )
115118 except (ProcessLookupError , PermissionError ):
116119 # Job has finished already, close file handles
@@ -126,6 +129,7 @@ def cancel(self, job):
126129
127130 This function waits for the spawned process tree to finish.
128131 '''
132+ self .log (f'cancelling job { job ._jobid } ' )
129133 self ._term_all (job )
130134 job ._cancel_time = time .time ()
131135
@@ -172,6 +176,7 @@ def _poll_job(self, job):
172176
173177 if pid :
174178 # Job has finished
179+ self .log (f'spawned process { job ._jobid } has finished' )
175180
176181 # Forcefully kill the whole session once the parent process exits
177182 self ._kill_all (job )
@@ -190,17 +195,21 @@ def _poll_job(self, job):
190195 job ._state = 'FAILURE'
191196
192197 job ._signal = os .WTERMSIG (status )
198+ self .log (f'job killed by signal: { job ._signal } ' )
199+
200+ self .log (f'job state: { job ._state } ' )
193201 else :
194202 # Job has not finished; check for timeouts
195203 now = time .time ()
196204 t_elapsed = now - job .submit_time
197205 if job .cancel_time :
198206 t_rem = self .CANCEL_GRACE_PERIOD - (now - job .cancel_time )
199- self .log (f'Job { job .jobid } has been cancelled; '
207+ self .log (f'job { job .jobid } has been cancelled; '
200208 f'giving it a grace period of { t_rem } seconds' )
201209 if t_rem <= 0 :
202210 self ._kill_all (job )
203211 elif job .time_limit and t_elapsed > job .time_limit :
212+ self .log (f'job { job ._jobid } timed out; cancelling it' )
204213 self .cancel (job )
205214 job ._state = 'TIMEOUT'
206215 job ._exception = JobError (
0 commit comments