Skip to content

Commit 4d01cd4

Browse files
committed
feat: add stuck job detection for run and await commands
Detect potentially stuck jobs and return early while the job continues running in the background. This prevents indefinite waits when a job hangs without producing output.

- Timeout: avg successful duration + 1 min (or 5 min if no history)
- Triggers when elapsed > timeout AND no output for 1 minute
- Shows helpful commands: gob stdout, gob await, gob stop
- Also applies to start -f and restart -f
1 parent f4f6d2d commit 4d01cd4

File tree

8 files changed

+179
-30
lines changed

8 files changed

+179
-30
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- **Stuck job detection**: `gob run` and `gob await` now detect potentially stuck jobs and return early
13+
- Timeout: average duration of successful runs + 1 minute (or 5 minutes when fewer than 3 successful runs are recorded)
14+
- Triggers when: elapsed time exceeds timeout AND no output for 1 minute
15+
- Displays stuck detection timeout when starting a job (e.g., "Stuck detection: timeout after 5m")
16+
- When triggered, shows helpful commands: `gob stdout`, `gob await`, `gob stop`
17+
- Job continues running in background - only the wait is aborted
18+
- Also applies to `gob start -f` and `gob restart -f`
19+
1020
### Changed
1121

1222
- **Stats by outcome**: `gob run` and `gob add` now show separate expected durations for successful and failed runs

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ No more "can you check if that's still running?" No more copy-pasting logs throu
3232
- **Reliable shutdowns** - Stop, restart, and shutdown verify every child process in the tree is gone
3333
- **Job persistence** - Jobs survive daemon restarts with SQLite-backed state
3434
- **Run history** - Track execution history and statistics for repeated commands
35+
- **Stuck detection** - Automatically detects jobs that may be stuck and returns early, while the job continues running
3536

3637
## Installation
3738

@@ -184,9 +185,17 @@ Do NOT use `gob` for:
184185
- `gob await <job_id>` - Wait for job to finish, stream output
185186
- `gob await-any` - Wait for whichever job finishes first
186187
- `gob list` - List jobs with IDs, status, and descriptions
188+
- `gob stdout <job_id>` - View current output (useful if job may be stuck)
187189
- `gob stop <job_id>` - Graceful stop
188190
- `gob restart <job_id>` - Stop + start
189191

192+
### Stuck Detection
193+
194+
`gob run` and `gob await` (as well as `gob start -f` and `gob restart -f`) automatically detect potentially stuck jobs:
195+
- Timeout: avg duration of successful runs + 1 min (or 5 min with fewer than 3 successful runs); triggers only when elapsed time exceeds the timeout AND there has been no output for 1 min
196+
- Job continues running in background
197+
- Use `gob stdout <job_id>` to check output, `gob await <job_id>` to continue waiting, or `gob stop <job_id>` to stop the job
198+
190199
### Examples
191200

192201
Servers and long-running:

cmd/await.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,32 @@ Exit codes:
6161
commandStr := strings.Join(job.Command, " ")
6262

6363
if job.Status == "running" {
64+
// Fetch stats for stuck detection
65+
var avgDurationMs int64
66+
stats, err := client.Stats(jobID)
67+
if err == nil && stats != nil && stats.SuccessCount >= 3 {
68+
avgDurationMs = stats.AvgDurationMs
69+
}
70+
stuckTimeout := CalculateStuckTimeout(avgDurationMs)
71+
6472
fmt.Printf("Awaiting job %s: %s\n", job.ID, commandStr)
73+
fmt.Printf(" Stuck detection: timeout after %s\n", formatDuration(stuckTimeout))
6574

6675
// Follow the output until completion
67-
completed, err := followJob(job.ID, job.PID, job.StdoutPath)
76+
followResult, err := followJob(job.ID, job.PID, job.StdoutPath, avgDurationMs)
6877
if err != nil {
6978
return err
7079
}
7180

72-
if !completed {
81+
if followResult.PossiblyStuck {
82+
fmt.Printf("\nJob %s possibly stuck (no output for 1m)\n", job.ID)
83+
fmt.Printf(" gob stdout %s # check current output\n", job.ID)
84+
fmt.Printf(" gob await %s # continue waiting with output\n", job.ID)
85+
fmt.Printf(" gob stop %s # stop the job\n", job.ID)
86+
return nil
87+
}
88+
89+
if !followResult.Completed {
7390
fmt.Printf("\nJob %s continues running in background\n", job.ID)
7491
return nil
7592
}

cmd/follow.go

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,22 @@ import (
1212
"github.com/juanibiapina/gob/internal/tail"
1313
)
1414

15-
// followJob follows a job's output until it completes or is interrupted
16-
// Returns true if job completed, false if interrupted
15+
// DefaultStuckTimeoutMs is the timeout when no historical data (5 minutes)
16+
const DefaultStuckTimeoutMs int64 = 5 * 60 * 1000
17+
18+
// NoOutputWindowMs is the constant "no output" window (1 minute)
19+
const NoOutputWindowMs int64 = 60 * 1000
20+
21+
// FollowResult represents the result of following a job
22+
type FollowResult struct {
23+
Completed bool // job finished running
24+
PossiblyStuck bool // job may be stuck (timed out without output)
25+
}
26+
27+
// followJob follows a job's output until it completes, is interrupted, or is detected as possibly stuck
28+
// avgDurationMs is the average duration of successful runs (0 if no history)
1729
// stdoutPath is the full path to the stdout log file
18-
func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
30+
func followJob(jobID string, pid int, stdoutPath string, avgDurationMs int64) (FollowResult, error) {
1931
// Derive stderr path from stdout path
2032
stderrPath := strings.Replace(stdoutPath, ".stdout.log", ".stderr.log", 1)
2133

@@ -31,12 +43,26 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
3143

3244
// Check if log files exist
3345
if _, err := os.Stat(stdoutPath); os.IsNotExist(err) {
34-
return false, fmt.Errorf("stdout log file not found: %s", stdoutPath)
46+
return FollowResult{}, fmt.Errorf("stdout log file not found: %s", stdoutPath)
3547
}
3648
if _, err := os.Stat(stderrPath); os.IsNotExist(err) {
37-
return false, fmt.Errorf("stderr log file not found: %s", stderrPath)
49+
return FollowResult{}, fmt.Errorf("stderr log file not found: %s", stderrPath)
3850
}
3951

52+
// Calculate stuck detection threshold
53+
// No data: 5 minutes
54+
// Has data: avg + 1 minute
55+
// Trigger: elapsed > threshold AND no output for 1 minute
56+
var stuckTimeoutMs int64
57+
if avgDurationMs == 0 {
58+
stuckTimeoutMs = DefaultStuckTimeoutMs
59+
} else {
60+
stuckTimeoutMs = avgDurationMs + NoOutputWindowMs
61+
}
62+
63+
stuckTimeout := time.Duration(stuckTimeoutMs) * time.Millisecond
64+
noOutputWindow := time.Duration(NoOutputWindowMs) * time.Millisecond
65+
4066
// Create follower
4167
follower := tail.NewFollower(os.Stdout)
4268

@@ -52,23 +78,37 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
5278
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
5379

5480
// Track completion status
55-
completed := false
81+
result := FollowResult{}
82+
startTime := time.Now()
5683

57-
// Monitor for process completion or signal
84+
// Monitor for process completion, signal, or stuck condition
5885
done := make(chan struct{})
5986
go func() {
6087
for {
6188
select {
6289
case <-done:
6390
return
6491
default:
92+
// Check if process completed
6593
if !process.IsProcessRunning(pid) {
6694
// Give a moment for any final output to be written
6795
time.Sleep(200 * time.Millisecond)
68-
completed = true
96+
result.Completed = true
97+
follower.Stop()
98+
return
99+
}
100+
101+
// Check for stuck condition
102+
// Trigger: elapsed > timeout AND no output for 1 minute
103+
elapsed := time.Since(startTime)
104+
timeSinceOutput := time.Since(follower.LastOutputTime())
105+
106+
if elapsed > stuckTimeout && timeSinceOutput > noOutputWindow {
107+
result.PossiblyStuck = true
69108
follower.Stop()
70109
return
71110
}
111+
72112
time.Sleep(100 * time.Millisecond)
73113
}
74114
}
@@ -83,5 +123,15 @@ func followJob(jobID string, pid int, stdoutPath string) (bool, error) {
83123

84124
follower.Wait()
85125

86-
return completed, nil
126+
return result, nil
127+
}
128+
129+
// CalculateStuckTimeout returns the stuck detection timeout based on average duration
130+
// No data: 5 minutes
131+
// Has data: avg + 1 minute
132+
func CalculateStuckTimeout(avgDurationMs int64) time.Duration {
133+
if avgDurationMs == 0 {
134+
return time.Duration(DefaultStuckTimeoutMs) * time.Millisecond
135+
}
136+
return time.Duration(avgDurationMs+NoOutputWindowMs) * time.Millisecond
87137
}

cmd/restart.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,25 @@ Exit codes:
7474

7575
// If follow flag is set, follow the output
7676
if restartFollow {
77-
completed, err := followJob(jobID, job.PID, job.StdoutPath)
77+
// Fetch stats for stuck detection
78+
var avgDurationMs int64
79+
stats, statsErr := client.Stats(jobID)
80+
if statsErr == nil && stats != nil && stats.SuccessCount >= 3 {
81+
avgDurationMs = stats.AvgDurationMs
82+
}
83+
stuckTimeout := CalculateStuckTimeout(avgDurationMs)
84+
fmt.Printf(" Stuck detection: timeout after %s\n", formatDuration(stuckTimeout))
85+
86+
followResult, err := followJob(jobID, job.PID, job.StdoutPath, avgDurationMs)
7887
if err != nil {
7988
return err
8089
}
81-
if completed {
90+
if followResult.PossiblyStuck {
91+
fmt.Printf("\nJob %s possibly stuck (no output for 1m)\n", jobID)
92+
fmt.Printf(" gob stdout %s # check current output\n", jobID)
93+
fmt.Printf(" gob await %s # continue waiting with output\n", jobID)
94+
fmt.Printf(" gob stop %s # stop the job\n", jobID)
95+
} else if followResult.Completed {
8296
fmt.Printf("\nJob %s completed\n", jobID)
8397
} else {
8498
fmt.Printf("\nJob %s continues running in background\n", jobID)

cmd/run.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,19 @@ Exit codes:
118118

119119
commandStr := strings.Join(commandArgs, " ")
120120

121+
// Determine average duration for stuck detection
122+
var avgDurationMs int64
123+
if result.Stats != nil && result.Stats.SuccessCount >= 3 {
124+
avgDurationMs = result.Stats.AvgDurationMs
125+
}
126+
stuckTimeout := CalculateStuckTimeout(avgDurationMs)
127+
121128
// Print message based on action
122129
if result.Action == "already_running" {
123130
startedAt, _ := time.Parse(time.RFC3339, result.Job.StartedAt)
124131
duration := formatDuration(time.Since(startedAt))
125132
fmt.Printf("Job %s already running (since %s ago), attaching...\n", result.Job.ID, duration)
133+
fmt.Printf(" Stuck detection: timeout after %s\n", formatDuration(stuckTimeout))
126134
} else {
127135
fmt.Printf("Running job %s: %s\n", result.Job.ID, commandStr)
128136

@@ -139,15 +147,24 @@ Exit codes:
139147
formatDuration(time.Duration(result.Stats.FailureAvgDurationMs)*time.Millisecond))
140148
}
141149
}
150+
fmt.Printf(" Stuck detection: timeout after %s\n", formatDuration(stuckTimeout))
142151
}
143152

144153
// Follow the output until completion
145-
completed, err := followJob(result.Job.ID, result.Job.PID, result.Job.StdoutPath)
154+
followResult, err := followJob(result.Job.ID, result.Job.PID, result.Job.StdoutPath, avgDurationMs)
146155
if err != nil {
147156
return err
148157
}
149158

150-
if !completed {
159+
if followResult.PossiblyStuck {
160+
fmt.Printf("\nJob %s possibly stuck (no output for 1m)\n", result.Job.ID)
161+
fmt.Printf(" gob stdout %s # check current output\n", result.Job.ID)
162+
fmt.Printf(" gob await %s # continue waiting with output\n", result.Job.ID)
163+
fmt.Printf(" gob stop %s # stop the job\n", result.Job.ID)
164+
return nil
165+
}
166+
167+
if !followResult.Completed {
151168
fmt.Printf("\nJob %s continues running in background\n", result.Job.ID)
152169
fmt.Printf(" gob await %s # wait for completion with live output\n", result.Job.ID)
153170
fmt.Printf(" gob stop %s # stop the job\n", result.Job.ID)

cmd/start.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,25 @@ Exit codes:
6363

6464
// If follow flag is set, follow the output
6565
if startFollow {
66-
completed, err := followJob(jobID, job.PID, job.StdoutPath)
66+
// Fetch stats for stuck detection
67+
var avgDurationMs int64
68+
stats, statsErr := client.Stats(jobID)
69+
if statsErr == nil && stats != nil && stats.SuccessCount >= 3 {
70+
avgDurationMs = stats.AvgDurationMs
71+
}
72+
stuckTimeout := CalculateStuckTimeout(avgDurationMs)
73+
fmt.Printf(" Stuck detection: timeout after %s\n", formatDuration(stuckTimeout))
74+
75+
followResult, err := followJob(jobID, job.PID, job.StdoutPath, avgDurationMs)
6776
if err != nil {
6877
return err
6978
}
70-
if completed {
79+
if followResult.PossiblyStuck {
80+
fmt.Printf("\nJob %s possibly stuck (no output for 1m)\n", jobID)
81+
fmt.Printf(" gob stdout %s # check current output\n", jobID)
82+
fmt.Printf(" gob await %s # continue waiting with output\n", jobID)
83+
fmt.Printf(" gob stop %s # stop the job\n", jobID)
84+
} else if followResult.Completed {
7185
fmt.Printf("\nJob %s completed\n", jobID)
7286
} else {
7387
fmt.Printf("\nJob %s continues running in background\n", jobID)

internal/tail/follow.go

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,14 @@ type FileSource struct {
6060

6161
// Follower manages following multiple files with support for dynamic source addition
6262
type Follower struct {
63-
w io.Writer
64-
mu sync.Mutex
65-
sources map[string]bool // tracks which paths are already being followed
66-
errCh chan error
67-
wg sync.WaitGroup
68-
done chan struct{}
69-
stopped bool
63+
w io.Writer
64+
mu sync.Mutex
65+
sources map[string]bool // tracks which paths are already being followed
66+
errCh chan error
67+
wg sync.WaitGroup
68+
done chan struct{}
69+
stopped bool
70+
lastOutputTime time.Time // time of last output written
7071
}
7172

7273
// SystemLogTag is the prefix used for system log messages (same length as job IDs)
@@ -86,13 +87,26 @@ func (f *Follower) SystemLog(format string, args ...interface{}) {
8687
// NewFollower creates a new Follower that writes to the given writer
8788
func NewFollower(w io.Writer) *Follower {
8889
return &Follower{
89-
w: w,
90-
sources: make(map[string]bool),
91-
errCh: make(chan error, 100),
92-
done: make(chan struct{}),
90+
w: w,
91+
sources: make(map[string]bool),
92+
errCh: make(chan error, 100),
93+
done: make(chan struct{}),
94+
lastOutputTime: time.Now(), // initialize to now so we don't immediately trigger stuck detection
9395
}
9496
}
9597

98+
// LastOutputTime returns the time of the last output written (thread-safe)
99+
func (f *Follower) LastOutputTime() time.Time {
100+
f.mu.Lock()
101+
defer f.mu.Unlock()
102+
return f.lastOutputTime
103+
}
104+
105+
// updateLastOutputTime updates the last output time (must be called with mu held)
106+
func (f *Follower) updateLastOutputTime() {
107+
f.lastOutputTime = time.Now()
108+
}
109+
96110
// AddSource adds a new file source to follow. If the source is already being
97111
// followed, this is a no-op.
98112
func (f *Follower) AddSource(source FileSource) {
@@ -107,7 +121,7 @@ func (f *Follower) AddSource(source FileSource) {
107121
f.wg.Add(1)
108122
go func() {
109123
defer f.wg.Done()
110-
err := followWithPrefix(source.Path, source.Prefix, f.w, &f.mu, f.done)
124+
err := followWithPrefix(source.Path, source.Prefix, f.w, &f.mu, f.done, f.updateLastOutputTime)
111125
if err != nil {
112126
f.errCh <- err
113127
}
@@ -149,7 +163,8 @@ func FollowMultiple(sources []FileSource, w io.Writer) error {
149163
}
150164

151165
// followWithPrefix follows a file and prefixes each line with the given prefix
152-
func followWithPrefix(filePath string, prefix string, w io.Writer, mu *sync.Mutex, done <-chan struct{}) error {
166+
// onOutput is called (with mu held) each time output is written
167+
func followWithPrefix(filePath string, prefix string, w io.Writer, mu *sync.Mutex, done <-chan struct{}, onOutput func()) error {
153168
file, err := os.Open(filePath)
154169
if err != nil {
155170
return err
@@ -192,6 +207,9 @@ func followWithPrefix(filePath string, prefix string, w io.Writer, mu *sync.Mute
192207
w.Write([]byte(prefix))
193208
}
194209
w.Write(line)
210+
if onOutput != nil {
211+
onOutput()
212+
}
195213
mu.Unlock()
196214

197215
data = data[idx+1:]

0 commit comments

Comments
 (0)