fix: retry primary model on transient errors even without fallback models

aheritier · aheritier · commit 881b4c77fad9 · 2026-02-15T23:47:20.000+01:00
getEffectiveRetries returned 0 when no fallback models were configured, meaning retryable errors (5xx, timeouts) got zero retries. This caused Anthropic streaming 'Internal server error' to immediately surface as 'all models failed' instead of being retried with backoff. Changed getEffectiveRetries to always return DefaultFallbackRetries (2) when no explicit retry count is configured, regardless of whether fallback models exist. Retrying the same model on transient errors is always valuable. Fixes #1743 Assisted-By: cagent
diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go
@@ -369,9 +369,10 @@ func getEffectiveCooldown(a *agent.Agent) time.Duration {
 }
 
 // getEffectiveRetries returns the number of retries to use for the agent.
-// If no retries are explicitly configured (retries == 0) and fallback models
-// are configured, returns DefaultFallbackRetries to provide sensible retry
-// behavior out of the box.
+// If no retries are explicitly configured (retries == 0), returns
+// DefaultFallbackRetries to provide sensible retry behavior out of the box.
+// Retries apply to retryable errors (5xx, timeouts) on the same model,
+// regardless of whether fallback models are configured.
 //
 // Note: Users who explicitly want 0 retries can set retries: -1 in their config
 // (though this is an edge case - most users want some retries for resilience).
@@ -381,8 +382,7 @@ func getEffectiveRetries(a *agent.Agent) int {
 	if retries < 0 {
 		return 0
 	}
-	// 0 means "use default" when fallback models are configured
-	if retries == 0 && len(a.FallbackModels()) > 0 {
+	if retries == 0 {
 		return DefaultFallbackRetries
 	}
 	return retries
diff --git a/pkg/runtime/fallback_test.go b/pkg/runtime/fallback_test.go
@@ -181,6 +181,11 @@ func TestIsRetryableModelError(t *testing.T) {
 			err:      errors.New("something weird happened"),
 			expected: false,
 		},
+		{
+			name:     "anthropic streaming internal server error",
+			err:      fmt.Errorf("error receiving from stream: %w", errors.New(`received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"},"request_id":"req_test"}`)),
+			expected: true,
+		},
 	}
 
 	for _, tt := range tests {
@@ -719,12 +724,12 @@ func TestGetEffectiveRetries(t *testing.T) {
 	mockModel := &mockProvider{id: "test/model", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
 	mockFallback := &mockProvider{id: "test/fallback", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
 
-	// Agent with no retries configured and no fallback models should return 0
+	// Agent with no retries configured and no fallback models should still get default retries
 	agentNoFallback := agent.New("no-fallback", "test",
 		agent.WithModel(mockModel),
 	)
 	retries := getEffectiveRetries(agentNoFallback)
-	assert.Equal(t, 0, retries, "no fallback models = no retries (nothing to retry to)")
+	assert.Equal(t, DefaultFallbackRetries, retries, "no fallback models should still get default retries for transient errors")
 
 	// Agent with no retries configured but with fallback models should use default
 	agentWithFallback := agent.New("with-fallback", "test",
@@ -877,6 +882,49 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) {
 	})
 }
 
+func TestPrimaryRetriesWithoutFallbackModels(t *testing.T) {
+	synctest.Test(t, func(t *testing.T) {
+		// Primary fails twice with retryable error (mimics Anthropic streaming internal
+		// server error), then succeeds. No fallback models are configured.
+		successStream := newStreamBuilder().
+			AddContent("Success after transient failures").
+			AddStopWithUsage(10, 5).
+			Build()
+		primary := &countingProvider{
+			id:        "primary/counting",
+			failCount: 2,
+			err:       errors.New(`error receiving from stream: received error while streaming: {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}}`),
+			stream:    successStream,
+		}
+
+		root := agent.New("root", "test",
+			agent.WithModel(primary),
+			// No fallback models
+		)
+
+		tm := team.New(team.WithAgents(root))
+		rt, err := NewLocalRuntime(tm, WithSessionCompaction(false), WithModelStore(mockModelStore{}))
+		require.NoError(t, err)
+
+		sess := session.New(session.WithUserMessage("test"))
+		sess.Title = "No Fallback Retry Test"
+
+		events := rt.RunStream(t.Context(), sess)
+
+		var gotContent bool
+		for ev := range events {
+			if choice, ok := ev.(*AgentChoiceEvent); ok {
+				if choice.Content == "Success after transient failures" {
+					gotContent = true
+				}
+			}
+		}
+
+		assert.True(t, gotContent, "should recover from transient errors even without fallback models")
+		assert.Equal(t, 3, primary.callCount, "primary should be called 3 times (2 failures + 1 success)")
+	})
+}
+
 // Verify interface compliance
 var (
 	_ provider.Provider = (*mockProvider)(nil)