# Module-level scheduler state. The functions in this module read and write
# these values while holding _state_lock.
_state_lock = Lock()

# Counters reset to zero by start_cron_clock().
_started_jobs = 0  # reported as "startedJobsSinceStartup"
_successful_jobs = 0  # incremented when a workflow script exits with code 0
_failed_jobs = 0  # incremented when a workflow script exits with a non-zero code
_configured_jobs = 0  # set by start_cron_clock() from its local tally
_invalid_jobs = 0  # reported as "invalidSchedules"

# UTC timestamp recorded by start_cron_clock(); None until then.
_started_at: datetime | None = None

# Per-job bookkeeping, keyed by scheduler job id ("<repo_name>:<workflow_id>").
# All of these maps are cleared by start_cron_clock().
_last_run_by_job: dict[str, datetime] = {}  # surfaced as "lastStartedAt"; NOTE(review): writer not visible here
_last_finished_by_job: dict[str, datetime] = {}  # UTC time at which the last run's process finished
_last_duration_ms_by_job: dict[str, int] = {}  # wall-clock duration of the last run, in milliseconds
_last_exit_code_by_job: dict[str, int] = {}  # return code of the last run
# stderr of the last failed run (or a fallback message when stderr was empty);
# the entry is removed again when the job next succeeds.
_last_error_by_job: dict[str, str] = {}
# Process object of the run currently tracked for each job; the entry is
# popped once that same process has finished.
_running_process_by_job: dict[str, subprocess.Popen[str]] = {}
2631
2732
@@ -48,7 +53,9 @@ def _resolve_script_path(repo_path: Path, shell_script_path: str) -> Path:
4853
4954
5055def _run_workflow_script (repo_path : Path , workflow_id : str , script_path : Path ) -> None :
51- global _started_jobs , _successful_jobs
56+ global _started_jobs , _successful_jobs , _failed_jobs
57+
58+ started_at = datetime .now (timezone .utc )
5259
5360 with _state_lock :
5461 _terminate_running_job (workflow_id )
@@ -66,28 +73,42 @@ def _run_workflow_script(repo_path: Path, workflow_id: str, script_path: Path) -
6673 logger .info ("Running cron workflow '%s' in '%s'" , workflow_id , repo_path )
6774 stdout , stderr = process .communicate ()
6875 returncode = process .returncode
76+ finished_at = datetime .now (timezone .utc )
77+ duration_ms = int ((finished_at - started_at ).total_seconds () * 1000 )
6978
7079 with _state_lock :
7180 if _running_process_by_job .get (workflow_id ) is process :
7281 _running_process_by_job .pop (workflow_id , None )
82+ _last_finished_by_job [workflow_id ] = finished_at
83+ _last_duration_ms_by_job [workflow_id ] = duration_ms
84+ _last_exit_code_by_job [workflow_id ] = returncode
7385
7486 if returncode == 0 :
7587 with _state_lock :
7688 _successful_jobs += 1
89+ _last_error_by_job .pop (workflow_id , None )
7790 logger .info ("Cron workflow '%s' completed" , workflow_id )
7891 if stdout .strip ():
7992 logger .info ("Cron workflow '%s' stdout: %s" , workflow_id , stdout .strip ())
8093 return
8194
95+ with _state_lock :
96+ _failed_jobs += 1
97+
8298 logger .warning ("Cron workflow '%s' failed with exit code %s" , workflow_id , returncode )
8399 if stdout .strip ():
84100 logger .warning ("Cron workflow '%s' stdout: %s" , workflow_id , stdout .strip ())
85101 if stderr .strip ():
102+ with _state_lock :
103+ _last_error_by_job [workflow_id ] = stderr .strip ()
86104 logger .warning ("Cron workflow '%s' stderr: %s" , workflow_id , stderr .strip ())
105+ else :
106+ with _state_lock :
107+ _last_error_by_job [workflow_id ] = f"Exit code { returncode } without stderr"
87108
88109
89110def start_cron_clock () -> None :
90- global _scheduler , _started_jobs , _successful_jobs , _configured_jobs , _invalid_jobs , _started_at
111+ global _scheduler , _started_jobs , _successful_jobs , _failed_jobs , _configured_jobs , _invalid_jobs , _started_at
91112
92113 if _scheduler is not None :
93114 return
@@ -110,12 +131,20 @@ def start_cron_clock() -> None:
110131 shell_script_path = workflow .get ("shellScriptPath" )
111132 workflow_id = workflow .get ("id" ) or "workflow"
112133 if not isinstance (schedule , str ) or not schedule .strip ():
134+ logger .warning ("Skipping workflow '%s' in '%s': missing schedule" , workflow_id , repo_name )
113135 continue
114136 if not isinstance (shell_script_path , str ) or not shell_script_path .strip ():
137+ logger .warning ("Skipping workflow '%s' in '%s': missing shellScriptPath" , workflow_id , repo_name )
115138 continue
116139
117140 script_path = _resolve_script_path (repo_path , shell_script_path )
118141 if not script_path .exists () or not script_path .is_file ():
142+ logger .warning (
143+ "Skipping workflow '%s' in '%s': script not found at '%s'" ,
144+ workflow_id ,
145+ repo_name ,
146+ script_path ,
147+ )
119148 continue
120149
121150 job_id = f"{ repo_name } :{ workflow_id } "
@@ -125,8 +154,9 @@ def start_cron_clock() -> None:
125154 CronTrigger .from_crontab (schedule ),
126155 id = job_id ,
127156 replace_existing = True ,
128- max_instances = 2 ,
157+ max_instances = 1 ,
129158 coalesce = True ,
159+ misfire_grace_time = 300 ,
130160 args = [repo_path , job_id , script_path ],
131161 )
132162 configured_jobs += 1
@@ -145,10 +175,15 @@ def start_cron_clock() -> None:
145175 with _state_lock :
146176 _started_jobs = 0
147177 _successful_jobs = 0
178+ _failed_jobs = 0
148179 _configured_jobs = configured_jobs
149180 _invalid_jobs = invalid_jobs
150181 _started_at = datetime .now (timezone .utc )
151182 _last_run_by_job .clear ()
183+ _last_finished_by_job .clear ()
184+ _last_duration_ms_by_job .clear ()
185+ _last_exit_code_by_job .clear ()
186+ _last_error_by_job .clear ()
152187 _running_process_by_job .clear ()
153188
154189 logger .info (
@@ -186,6 +221,7 @@ def get_cron_clock_status() -> dict[str, object]:
186221 with _state_lock :
187222 started_jobs = _started_jobs
188223 successful_jobs = _successful_jobs
224+ failed_jobs = _failed_jobs
189225 configured_jobs = _configured_jobs
190226 invalid_jobs = _invalid_jobs
191227 started_at = _started_at
@@ -213,6 +249,7 @@ def get_cron_clock_status() -> dict[str, object]:
213249 "invalidSchedules" : invalid_jobs ,
214250 "startedJobsSinceStartup" : started_jobs ,
215251 "successfulJobsSinceStartup" : successful_jobs ,
252+ "failedJobsSinceStartup" : failed_jobs ,
216253 }
217254
218255
@@ -231,3 +268,44 @@ def get_cron_job_last_runs() -> dict[str, str | None]:
231268 job_last_runs [job .id ] = last_runs .get (job .id )
232269
233270 return job_last_runs
271+
272+
def get_cron_job_diagnostics() -> dict[str, dict[str, object | None]]:
    """Report the latest run details and the next run time for each scheduled job.

    Returns:
        A mapping from scheduler job id to a dict with the keys
        ``lastStartedAt``, ``lastFinishedAt``, ``lastDurationMs``,
        ``lastExitCode``, ``lastError``, ``nextRunAt`` and ``running``.
        Timestamps are ISO-8601 strings. Any value that has not been
        recorded for a job is ``None``. ``running`` is ``True`` only while
        the tracked process for that job has not exited. An empty dict is
        returned when no scheduler has been created.
    """
    if _scheduler is None:
        return {}

    # Copy everything out while holding the lock so the report below is
    # assembled from one consistent snapshot, without blocking job threads.
    with _state_lock:
        started_iso = {
            job_id: moment.isoformat() for job_id, moment in _last_run_by_job.items()
        }
        finished_iso = {
            job_id: moment.isoformat() for job_id, moment in _last_finished_by_job.items()
        }
        duration_by_job = _last_duration_ms_by_job.copy()
        exit_code_by_job = _last_exit_code_by_job.copy()
        error_by_job = _last_error_by_job.copy()

        # A job counts as active only if its process is still alive right now.
        active_jobs = set()
        for job_id, proc in _running_process_by_job.items():
            if proc.poll() is None:
                active_jobs.add(job_id)

    report: dict[str, dict[str, object | None]] = {}
    for job in _scheduler.get_jobs():
        upcoming = job.next_run_time
        entry: dict[str, object | None] = {
            "lastStartedAt": started_iso.get(job.id),
            "lastFinishedAt": finished_iso.get(job.id),
            "lastDurationMs": duration_by_job.get(job.id),
            "lastExitCode": exit_code_by_job.get(job.id),
            "lastError": error_by_job.get(job.id),
            "nextRunAt": upcoming.isoformat() if upcoming is not None else None,
            "running": job.id in active_jobs,
        }
        report[job.id] = entry

    return report
0 commit comments