3030_last_stdout_by_job : dict [str , str ] = {}
3131_last_stderr_by_job : dict [str , str ] = {}
3232_running_process_by_job : dict [str , subprocess .Popen [str ]] = {}
33+ _job_start_times : dict [str , datetime ] = {}
3334
35+ DEFAULT_MAX_RUNTIME_MINUTES = 120 # 2 hours default
36+ _workflow_max_runtime : dict [str , int ] = {}
3437
def _terminate_running_job_unlocked(workflow_id: str) -> None:
    """Terminate the tracked process for *workflow_id*, if one is running.

    Caller must already hold ``_state_lock``. Note the terminate/wait
    sequence below executes while that lock is held (up to ~10s in the
    worst case), blocking all other state access for the duration — the
    original comment claiming termination happens "outside lock" was wrong.
    """
    running_process = _running_process_by_job.get(workflow_id)
    if running_process is None or running_process.poll() is not None:
        # Nothing running, or it already exited — drop any stale entry.
        _running_process_by_job.pop(workflow_id, None)
        return

    logger.info("Stopping previous cron workflow run for '%s'", workflow_id)
    running_process.terminate()

    try:
        running_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        logger.warning("Force killing previous cron workflow run for '%s'", workflow_id)
        try:
            running_process.kill()
            running_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # Even SIGKILL did not reap it; log and move on rather than hang.
            logger.error("Process '%s' became zombie - continuing anyway", workflow_id)

    # Drop the tracking entry (caller holds the lock).
    _running_process_by_job.pop(workflow_id, None)
63+
def _terminate_running_job(workflow_id: str) -> None:
    """Locking wrapper: take ``_state_lock``, then delegate the actual
    terminate/cleanup work to ``_terminate_running_job_unlocked``."""
    with _state_lock:
        _terminate_running_job_unlocked(workflow_id)
4969
5070def _resolve_script_path (repo_path : Path , shell_script_path : str ) -> Path :
@@ -61,6 +81,25 @@ def _tail_output(value: str, max_lines: int = 20) -> str:
6181 return "\n " .join (lines [- max_lines :])
6282
6383
def _monitor_job_timeouts() -> None:
    """Scheduler tick: terminate any running job past its runtime limit."""
    now = datetime.now(timezone.utc)
    with _state_lock:
        # Snapshot the items — termination mutates the tracking dicts.
        for job_id, started in list(_job_start_times.items()):
            if job_id not in _running_process_by_job:
                continue
            limit = _workflow_max_runtime.get(job_id, DEFAULT_MAX_RUNTIME_MINUTES)
            elapsed_minutes = (now - started).total_seconds() / 60
            if elapsed_minutes <= limit:
                continue
            logger.warning(
                "Job '%s' exceeded %d min limit, terminating",
                job_id,
                limit,
            )
            _terminate_running_job_unlocked(job_id)
102+
64103def _wait_for_workflow_process (
65104 workflow_id : str ,
66105 process : subprocess .Popen [str ],
@@ -78,6 +117,7 @@ def _wait_for_workflow_process(
78117 with _state_lock :
79118 if _running_process_by_job .get (workflow_id ) is process :
80119 _running_process_by_job .pop (workflow_id , None )
120+ _job_start_times .pop (workflow_id , None )
81121 _last_finished_by_job [workflow_id ] = finished_at
82122 _last_duration_ms_by_job [workflow_id ] = duration_ms
83123 _last_exit_code_by_job [workflow_id ] = returncode
@@ -102,7 +142,9 @@ def _wait_for_workflow_process(
102142 with _state_lock :
103143 _failed_jobs += 1
104144
105- logger .warning ("Cron workflow '%s' failed with exit code %s" , workflow_id , returncode )
145+ logger .warning (
146+ "Cron workflow '%s' failed with exit code %s" , workflow_id , returncode
147+ )
106148 if stdout_tail :
107149 logger .warning ("Cron workflow '%s' stdout: %s" , workflow_id , stdout_tail )
108150 if stderr_tail :
@@ -120,7 +162,7 @@ def _run_workflow_script(repo_path: Path, workflow_id: str, script_path: Path) -
120162 started_at = datetime .now (timezone .utc )
121163
122164 with _state_lock :
123- _terminate_running_job (workflow_id )
165+ _terminate_running_job_unlocked (workflow_id )
124166 _started_jobs += 1
125167 _last_run_by_job [workflow_id ] = datetime .now (timezone .utc )
126168 process = subprocess .Popen (
@@ -131,6 +173,7 @@ def _run_workflow_script(repo_path: Path, workflow_id: str, script_path: Path) -
131173 text = True ,
132174 )
133175 _running_process_by_job [workflow_id ] = process
176+ _job_start_times [workflow_id ] = started_at
134177
135178 logger .info ("Running cron workflow '%s' in '%s'" , workflow_id , repo_path )
136179 Thread (
@@ -141,7 +184,14 @@ def _run_workflow_script(repo_path: Path, workflow_id: str, script_path: Path) -
141184
142185
143186def start_cron_clock () -> None :
144- global _scheduler , _started_jobs , _successful_jobs , _failed_jobs , _configured_jobs , _invalid_jobs , _started_at
187+ global \
188+ _scheduler , \
189+ _started_jobs , \
190+ _successful_jobs , \
191+ _failed_jobs , \
192+ _configured_jobs , \
193+ _invalid_jobs , \
194+ _started_at
145195
146196 if _scheduler is not None :
147197 return
@@ -164,10 +214,18 @@ def start_cron_clock() -> None:
164214 shell_script_path = workflow .get ("shellScriptPath" )
165215 workflow_id = workflow .get ("id" ) or "workflow"
166216 if not isinstance (schedule , str ) or not schedule .strip ():
167- logger .warning ("Skipping workflow '%s' in '%s': missing schedule" , workflow_id , repo_name )
217+ logger .warning (
218+ "Skipping workflow '%s' in '%s': missing schedule" ,
219+ workflow_id ,
220+ repo_name ,
221+ )
168222 continue
169223 if not isinstance (shell_script_path , str ) or not shell_script_path .strip ():
170- logger .warning ("Skipping workflow '%s' in '%s': missing shellScriptPath" , workflow_id , repo_name )
224+ logger .warning (
225+ "Skipping workflow '%s' in '%s': missing shellScriptPath" ,
226+ workflow_id ,
227+ repo_name ,
228+ )
171229 continue
172230
173231 script_path = _resolve_script_path (repo_path , shell_script_path )
@@ -181,6 +239,12 @@ def start_cron_clock() -> None:
181239 continue
182240
183241 job_id = f"{ repo_name } :{ workflow_id } "
242+
243+ # Configure runtime limits
244+ max_runtime = workflow .get ("maxRuntimeMinutes" , DEFAULT_MAX_RUNTIME_MINUTES )
245+ if max_runtime :
246+ _workflow_max_runtime [job_id ] = max_runtime
247+
184248 try :
185249 scheduler .add_job (
186250 _run_workflow_script ,
@@ -205,6 +269,15 @@ def start_cron_clock() -> None:
205269 scheduler .start ()
206270 _scheduler = scheduler
207271
272+ # Start job timeout monitor
273+ scheduler .add_job (
274+ _monitor_job_timeouts ,
275+ "interval" ,
276+ minutes = 1 ,
277+ id = "_job_timeout_monitor" ,
278+ replace_existing = True ,
279+ )
280+
208281 with _state_lock :
209282 _started_jobs = 0
210283 _successful_jobs = 0
@@ -234,11 +307,13 @@ def stop_cron_clock() -> None:
234307 if _scheduler is None :
235308 return
236309
237- _scheduler .shutdown (wait = False )
310+ # Wait for all scheduled jobs (including timeout monitor) to complete
311+ # before acquiring locks to prevent deadlock
312+ _scheduler .shutdown (wait = True )
238313
239314 with _state_lock :
240315 for workflow_id in list (_running_process_by_job ):
241- _terminate_running_job (workflow_id )
316+ _terminate_running_job_unlocked (workflow_id )
242317 _running_process_by_job .pop (workflow_id , None )
243318
244319 _scheduler = None
@@ -323,6 +398,7 @@ def get_cron_job_diagnostics() -> dict[str, dict[str, object | None]]:
323398 errors = dict (_last_error_by_job )
324399 stdout_by_job = dict (_last_stdout_by_job )
325400 stderr_by_job = dict (_last_stderr_by_job )
401+ start_times = dict (_job_start_times )
326402 running = {
327403 workflow_id
328404 for workflow_id , process in _running_process_by_job .items ()
@@ -335,6 +411,12 @@ def get_cron_job_diagnostics() -> dict[str, dict[str, object | None]]:
335411 if job .next_run_time is not None :
336412 next_run_time = job .next_run_time .isoformat ()
337413
414+ runtime_minutes = None
415+ if job .id in start_times and job .id in running :
416+ runtime_minutes = int (
417+ (datetime .now (timezone .utc ) - start_times [job .id ]).total_seconds () / 60
418+ )
419+
338420 diagnostics [job .id ] = {
339421 "lastStartedAt" : last_runs .get (job .id ),
340422 "lastFinishedAt" : finished_runs .get (job .id ),
@@ -345,6 +427,38 @@ def get_cron_job_diagnostics() -> dict[str, dict[str, object | None]]:
345427 "lastStderr" : stderr_by_job .get (job .id ),
346428 "nextRunAt" : next_run_time ,
347429 "running" : job .id in running ,
430+ "runtimeMinutes" : runtime_minutes ,
348431 }
349432
350433 return diagnostics
434+
435+
def force_terminate_job(workflow_id: str) -> bool:
    """Admin function to manually terminate a job.

    Returns:
        True if a tracked process existed for *workflow_id* and termination
        was attempted; False if no such job was being tracked.
    """
    with _state_lock:
        if workflow_id not in _running_process_by_job:
            return False

        # BUG FIX: the original called _terminate_running_job() here, which
        # re-acquires _state_lock while this function already holds it — a
        # guaranteed deadlock if _state_lock is a plain threading.Lock.
        # Use the unlocked variant, matching the other already-locked call
        # sites (_run_workflow_script, stop_cron_clock, _monitor_job_timeouts).
        _terminate_running_job_unlocked(workflow_id)
        return True
445+
def get_long_running_jobs(threshold_minutes: int = 60) -> list[dict]:
    """Return a summary entry for each tracked job whose current runtime
    exceeds *threshold_minutes* (workflow id, whole-minute runtime, and
    ISO-formatted start time)."""
    now = datetime.now(timezone.utc)

    # Take a snapshot under the lock, then build the result outside it.
    with _state_lock:
        snapshot = [
            (job_id, started, (now - started).total_seconds() / 60)
            for job_id, started in _job_start_times.items()
            if job_id in _running_process_by_job
        ]

    return [
        {
            "workflow_id": job_id,
            "runtime_minutes": int(minutes),
            "started_at": started.isoformat(),
        }
        for job_id, started, minutes in snapshot
        if minutes > threshold_minutes
    ]
0 commit comments