@@ -12,10 +12,22 @@ import (
1212 "github.com/juanibiapina/gob/internal/tail"
1313)
1414
15- // followJob follows a job's output until it completes or is interrupted
16- // Returns true if job completed, false if interrupted
// Timing parameters for stuck-job detection, expressed in milliseconds.
const (
	// DefaultStuckTimeoutMs is the stuck timeout applied when no historical
	// duration data is available (5 minutes).
	DefaultStuckTimeoutMs int64 = 5 * 60 * 1000

	// NoOutputWindowMs is the fixed "no output" window (1 minute).
	NoOutputWindowMs int64 = 60 * 1000
)
20+
// FollowResult describes how a call to followJob ended.
type FollowResult struct {
	// Completed is true when the job finished running.
	Completed bool
	// PossiblyStuck is true when the job may be stuck, i.e. it timed out
	// without producing output.
	PossiblyStuck bool
}
26+
27+ // followJob follows a job's output until it completes, is interrupted, or is detected as possibly stuck
28+ // avgDurationMs is the average duration of successful runs (0 if no history)
1729// stdoutPath is the full path to the stdout log file
18- func followJob (jobID string , pid int , stdoutPath string ) (bool , error ) {
30+ func followJob (jobID string , pid int , stdoutPath string , avgDurationMs int64 ) (FollowResult , error ) {
1931 // Derive stderr path from stdout path
2032 stderrPath := strings .Replace (stdoutPath , ".stdout.log" , ".stderr.log" , 1 )
2133
@@ -31,12 +43,26 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
3143
3244 // Check if log files exist
3345 if _ , err := os .Stat (stdoutPath ); os .IsNotExist (err ) {
34- return false , fmt .Errorf ("stdout log file not found: %s" , stdoutPath )
46+ return FollowResult {} , fmt .Errorf ("stdout log file not found: %s" , stdoutPath )
3547 }
3648 if _ , err := os .Stat (stderrPath ); os .IsNotExist (err ) {
37- return false , fmt .Errorf ("stderr log file not found: %s" , stderrPath )
49+ return FollowResult {} , fmt .Errorf ("stderr log file not found: %s" , stderrPath )
3850 }
3951
52+ // Calculate stuck detection threshold
53+ // No data: 5 minutes
54+ // Has data: avg + 1 minute
55+ // Trigger: elapsed > threshold AND no output for 1 minute
56+ var stuckTimeoutMs int64
57+ if avgDurationMs == 0 {
58+ stuckTimeoutMs = DefaultStuckTimeoutMs
59+ } else {
60+ stuckTimeoutMs = avgDurationMs + NoOutputWindowMs
61+ }
62+
63+ stuckTimeout := time .Duration (stuckTimeoutMs ) * time .Millisecond
64+ noOutputWindow := time .Duration (NoOutputWindowMs ) * time .Millisecond
65+
4066 // Create follower
4167 follower := tail .NewFollower (os .Stdout )
4268
@@ -52,23 +78,37 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
5278 signal .Notify (sigCh , syscall .SIGINT , syscall .SIGTERM )
5379
5480 // Track completion status
55- completed := false
81+ result := FollowResult {}
82+ startTime := time .Now ()
5683
57- // Monitor for process completion or signal
84+ // Monitor for process completion, signal, or stuck condition
5885 done := make (chan struct {})
5986 go func () {
6087 for {
6188 select {
6289 case <- done :
6390 return
6491 default :
92+ // Check if process completed
6593 if ! process .IsProcessRunning (pid ) {
6694 // Give a moment for any final output to be written
6795 time .Sleep (200 * time .Millisecond )
68- completed = true
96+ result .Completed = true
97+ follower .Stop ()
98+ return
99+ }
100+
101+ // Check for stuck condition
102+ // Trigger: elapsed > timeout AND no output for 1 minute
103+ elapsed := time .Since (startTime )
104+ timeSinceOutput := time .Since (follower .LastOutputTime ())
105+
106+ if elapsed > stuckTimeout && timeSinceOutput > noOutputWindow {
107+ result .PossiblyStuck = true
69108 follower .Stop ()
70109 return
71110 }
111+
72112 time .Sleep (100 * time .Millisecond )
73113 }
74114 }
@@ -83,5 +123,15 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
83123
84124 follower .Wait ()
85125
86- return completed , nil
126+ return result , nil
127+ }
128+
129+ // CalculateStuckTimeout returns the stuck detection timeout based on average duration
130+ // No data: 5 minutes
131+ // Has data: avg + 1 minute
132+ func CalculateStuckTimeout (avgDurationMs int64 ) time.Duration {
133+ if avgDurationMs == 0 {
134+ return time .Duration (DefaultStuckTimeoutMs ) * time .Millisecond
135+ }
136+ return time .Duration (avgDurationMs + NoOutputWindowMs ) * time .Millisecond
87137}
0 commit comments