Skip to content

Commit d8ef913

Browse files
ToddHebebrand and claude committed
fix: add crash cooldown to prevent helper spawn loop
When breeze-helper.exe crashes repeatedly on startup (e.g. because of an outdated binary), Apply() would respawn it on every heartbeat, indefinitely. The manager now tracks consecutive crashes per session and enters a 5-minute cooldown after 5 consecutive crashes. The crash counter and cooldown reset when the helper is observed alive or a new version is installed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1cf0f9a commit d8ef913

File tree

2 files changed

+86
-2
lines changed

2 files changed

+86
-2
lines changed

agent/internal/helper/manager.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ type Manager struct {
7474
isOurProcessFunc func(pid int, binaryPath string) bool
7575
stopByPIDFunc func(pid int) error
7676

77-
pendingHelperVersion string
77+
pendingHelperVersion string
78+
updateFailures int
79+
abandonedVersion string // version we gave up updating to
7880
}
7981

8082
// New creates a new helper Manager.
@@ -304,8 +306,23 @@ var minConfigFlagVersion = [3]int{0, 14, 0}
304306

305307
func (m *Manager) ensureRunningSession(state *sessionState) error {
306308
if state.pid > 0 && m.isOurProcessFunc(state.pid, m.binaryPath) {
309+
state.resetCrashes()
307310
return nil
308311
}
312+
313+
// If we recently spawned and the process is already gone, count it as a crash.
314+
if !state.lastSpawnTime.IsZero() && state.pid > 0 {
315+
state.recordCrash()
316+
if state.inCooldown() {
317+
log.Warn("breeze assist keeps crashing, backing off",
318+
"session", state.key,
319+
"crashes", state.spawnCrashes,
320+
"cooldownUntil", state.cooldownUntil.Format("15:04:05"),
321+
)
322+
return fmt.Errorf("in cooldown after %d crashes", state.spawnCrashes)
323+
}
324+
}
325+
309326
var pid int
310327
var err error
311328
if m.helperSupportsConfigFlag() {
@@ -317,6 +334,7 @@ func (m *Manager) ensureRunningSession(state *sessionState) error {
317334
return err
318335
}
319336
state.pid = pid
337+
state.recordSpawn()
320338
return nil
321339
}
322340

@@ -409,9 +427,14 @@ func (m *Manager) downloadAndInstall() error {
409427
func (m *Manager) CheckUpdate(targetVersion string) {
410428
m.mu.Lock()
411429
defer m.mu.Unlock()
430+
if targetVersion == m.abandonedVersion {
431+
return // already failed for this version, don't retry
432+
}
412433
if m.pendingHelperVersion != targetVersion {
413434
log.Info("helper update pending", "targetVersion", targetVersion)
414435
m.pendingHelperVersion = targetVersion
436+
m.updateFailures = 0
437+
m.abandonedVersion = ""
415438
}
416439
}
417440

@@ -445,6 +468,19 @@ func (m *Manager) applyPendingUpdate() {
445468
if installed := m.installedVersionLocked(); installed == m.pendingHelperVersion {
446469
log.Info("helper already at target version, clearing pending update", "version", installed)
447470
m.pendingHelperVersion = ""
471+
m.updateFailures = 0
472+
return
473+
}
474+
475+
const maxUpdateFailures = 3
476+
if m.updateFailures >= maxUpdateFailures {
477+
log.Warn("helper update abandoned after repeated failures, clearing pending update",
478+
"targetVersion", m.pendingHelperVersion,
479+
"failures", m.updateFailures,
480+
)
481+
m.abandonedVersion = m.pendingHelperVersion
482+
m.pendingHelperVersion = ""
483+
m.updateFailures = 0
448484
return
449485
}
450486

@@ -472,7 +508,8 @@ func (m *Manager) applyPendingUpdate() {
472508
}
473509

474510
if err := m.downloadAndInstall(); err != nil {
475-
log.Error("failed to install helper update", "error", err.Error())
511+
m.updateFailures++
512+
log.Error("failed to install helper update", "error", err.Error(), "failures", m.updateFailures)
476513
if restoreErr := restoreBackup(backupPath, m.binaryPath); restoreErr != nil {
477514
log.Error("failed to rollback helper", "error", restoreErr.Error())
478515
}
@@ -488,6 +525,7 @@ func (m *Manager) applyPendingUpdate() {
488525

489526
for _, state := range stopped {
490527
state.pid = 0
528+
state.resetCrashes() // new binary — give it a fresh chance
491529
if err := m.ensureRunningSession(state); err != nil {
492530
log.Error("failed to start updated helper", "session", state.key, "error", err.Error())
493531
if restoreErr := restoreBackup(backupPath, m.binaryPath); restoreErr != nil {

agent/internal/helper/session_state.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ type SessionInfo struct {
1717
UID uint32
1818
}
1919

20+
// Crash-loop protection settings for helper sessions.
const (
	// maxSpawnCrashes is the crash count at which a session enters a cooldown.
	maxSpawnCrashes = 5

	// spawnCooldown is how long spawning stays suspended once that count is
	// reached.
	spawnCooldown = 5 * time.Minute
)
25+
2026
type sessionState struct {
2127
key string
2228
configPath string
@@ -25,6 +31,11 @@ type sessionState struct {
2531
pid int
2632
watcher *watcher
2733
lastApplied time.Time
34+
35+
// Crash tracking: prevents spawn loops when the helper keeps crashing.
36+
spawnCrashes int // consecutive spawns where the helper died before next check
37+
lastSpawnTime time.Time // when we last spawned
38+
cooldownUntil time.Time // if set, don't spawn until this time
2839
}
2940

3041
func newSessionState(key, baseDir string) *sessionState {
@@ -56,3 +67,38 @@ func (s *sessionState) refreshPID() {
5667
}
5768
s.pid = status.PID
5869
}
70+
71+
// inCooldown returns true if the helper crashed too many times and we should
72+
// wait before spawning again.
73+
func (s *sessionState) inCooldown() bool {
74+
if s.cooldownUntil.IsZero() {
75+
return false
76+
}
77+
if time.Now().After(s.cooldownUntil) {
78+
// Cooldown expired — reset and allow spawning.
79+
s.spawnCrashes = 0
80+
s.cooldownUntil = time.Time{}
81+
return false
82+
}
83+
return true
84+
}
85+
86+
// recordSpawn notes that we just spawned the helper.
87+
func (s *sessionState) recordSpawn() {
88+
s.lastSpawnTime = time.Now()
89+
}
90+
91+
// recordCrash should be called when a previously-spawned helper is no longer
92+
// running. If crashes exceed the threshold, a cooldown is entered.
93+
func (s *sessionState) recordCrash() {
94+
s.spawnCrashes++
95+
if s.spawnCrashes >= maxSpawnCrashes {
96+
s.cooldownUntil = time.Now().Add(spawnCooldown)
97+
}
98+
}
99+
100+
// resetCrashes clears the crash counter (called when the helper is confirmed alive).
101+
func (s *sessionState) resetCrashes() {
102+
s.spawnCrashes = 0
103+
s.cooldownUntil = time.Time{}
104+
}

0 commit comments

Comments
 (0)