Skip to content

Commit 881b4c7

Browse files
committed
fix: retry primary model on transient errors even without fallback models
getEffectiveRetries returned 0 when no fallback models were configured, meaning retryable errors (5xx, timeouts) got zero retries. This caused Anthropic streaming 'Internal server error' to immediately surface as 'all models failed' instead of being retried with backoff. Changed getEffectiveRetries to always return DefaultFallbackRetries (2) when no explicit retry count is configured, regardless of whether fallback models exist. Retrying the same model on transient errors is always valuable. Fixes #1743 Assisted-By: cagent
1 parent 8df5bc5 commit 881b4c7

File tree

2 files changed

+55
-7
lines changed

2 files changed

+55
-7
lines changed

pkg/runtime/fallback.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,10 @@ func getEffectiveCooldown(a *agent.Agent) time.Duration {
369369
}
370370

371371
// getEffectiveRetries returns the number of retries to use for the agent.
372-
// If no retries are explicitly configured (retries == 0) and fallback models
373-
// are configured, returns DefaultFallbackRetries to provide sensible retry
374-
// behavior out of the box.
372+
// If no retries are explicitly configured (retries == 0), returns
373+
// DefaultFallbackRetries to provide sensible retry behavior out of the box.
374+
// Retries apply to retryable errors (5xx, timeouts) on the same model,
375+
// regardless of whether fallback models are configured.
375376
//
376377
// Note: Users who explicitly want 0 retries can set retries: -1 in their config
377378
// (though this is an edge case - most users want some retries for resilience).
@@ -381,8 +382,7 @@ func getEffectiveRetries(a *agent.Agent) int {
381382
if retries < 0 {
382383
return 0
383384
}
384-
// 0 means "use default" when fallback models are configured
385-
if retries == 0 && len(a.FallbackModels()) > 0 {
385+
if retries == 0 {
386386
return DefaultFallbackRetries
387387
}
388388
return retries

pkg/runtime/fallback_test.go

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,11 @@ func TestIsRetryableModelError(t *testing.T) {
181181
err: errors.New("something weird happened"),
182182
expected: false,
183183
},
184+
{
185+
name: "anthropic streaming internal server error",
186+
err: fmt.Errorf("error receiving from stream: %w", errors.New(`received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"},"request_id":"req_test"}`)),
187+
expected: true,
188+
},
184189
}
185190

186191
for _, tt := range tests {
@@ -719,12 +724,12 @@ func TestGetEffectiveRetries(t *testing.T) {
719724
mockModel := &mockProvider{id: "test/model", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
720725
mockFallback := &mockProvider{id: "test/fallback", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
721726

722-
// Agent with no retries configured and no fallback models should return 0
727+
// Agent with no retries configured and no fallback models should still get default retries
723728
agentNoFallback := agent.New("no-fallback", "test",
724729
agent.WithModel(mockModel),
725730
)
726731
retries := getEffectiveRetries(agentNoFallback)
727-
assert.Equal(t, 0, retries, "no fallback models = no retries (nothing to retry to)")
732+
assert.Equal(t, DefaultFallbackRetries, retries, "no fallback models should still get default retries for transient errors")
728733

729734
// Agent with no retries configured but with fallback models should use default
730735
agentWithFallback := agent.New("with-fallback", "test",
@@ -877,6 +882,49 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) {
877882
})
878883
}
879884

885+
func TestPrimaryRetriesWithoutFallbackModels(t *testing.T) {
886+
synctest.Test(t, func(t *testing.T) {
887+
// Primary fails twice with retryable error (mimics Anthropic streaming internal
888+
// server error), then succeeds. No fallback models are configured.
889+
successStream := newStreamBuilder().
890+
AddContent("Success after transient failures").
891+
AddStopWithUsage(10, 5).
892+
Build()
893+
primary := &countingProvider{
894+
id: "primary/counting",
895+
failCount: 2,
896+
err: errors.New(`error receiving from stream: received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}`),
897+
stream: successStream,
898+
}
899+
900+
root := agent.New("root", "test",
901+
agent.WithModel(primary),
902+
// No fallback models
903+
)
904+
905+
tm := team.New(team.WithAgents(root))
906+
rt, err := NewLocalRuntime(tm, WithSessionCompaction(false), WithModelStore(mockModelStore{}))
907+
require.NoError(t, err)
908+
909+
sess := session.New(session.WithUserMessage("test"))
910+
sess.Title = "No Fallback Retry Test"
911+
912+
events := rt.RunStream(t.Context(), sess)
913+
914+
var gotContent bool
915+
for ev := range events {
916+
if choice, ok := ev.(*AgentChoiceEvent); ok {
917+
if choice.Content == "Success after transient failures" {
918+
gotContent = true
919+
}
920+
}
921+
}
922+
923+
assert.True(t, gotContent, "should recover from transient errors even without fallback models")
924+
assert.Equal(t, 3, primary.callCount, "primary should be called 3 times (2 failures + 1 success)")
925+
})
926+
}
927+
880928
// Verify interface compliance
881929
var (
882930
_ provider.Provider = (*mockProvider)(nil)

0 commit comments

Comments
 (0)