10 changes: 6 additions & 4 deletions cagent-schema.json
@@ -451,15 +451,15 @@
},
"provider_opts": {
"type": "object",
"description": "Provider-specific options. dmr: runtime_flags. anthropic: interleaved_thinking (boolean, default false). openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance).",
"description": "Provider-specific options. dmr: runtime_flags. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true). openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance).",
"additionalProperties": true
},
"track_usage": {
"type": "boolean",
"description": "Whether to track usage"
},
"thinking_budget": {
"description": "Controls reasoning effort/budget. OpenAI: string levels ('minimal','low','medium','high'). Anthropic: integer token budget (1024-32768). Gemini: integer token budget (-1 for unlimited, 0 to disable, 24576 max).",
"description": "Controls reasoning effort/budget. OpenAI: string levels ('minimal','low','medium','high'), default 'medium'. Anthropic: integer token budget (1024-32768), default 8192. Amazon Bedrock (Claude): same as Anthropic. Google Gemini 2.5: integer token budget (-1 for dynamic, 0 to disable, 24576 max), default -1. Google Gemini 3: string levels ('minimal' Flash only,'low','medium','high'), default 'high' for Pro, 'medium' for Flash.",
"oneOf": [
{
"type": "string",
@@ -469,21 +469,23 @@
"medium",
"high"
],
"description": "Reasoning effort level (OpenAI)"
"description": "Reasoning effort level (OpenAI, Gemini 3)"
},
{
"type": "integer",
"minimum": -1,
"maximum": 32768,
"description": "Token budget for extended thinking (Anthropic, Google)"
"description": "Token budget for extended thinking (Anthropic, Bedrock Claude, Gemini 2.5)"
}
],
"examples": [
"minimal",
"low",
"medium",
"high",
-1,
1024,
8192,
32768
]
},
100 changes: 87 additions & 13 deletions docs/USAGE.md
@@ -283,9 +283,11 @@ models:

Determine how much the model should think by setting the `thinking_budget` option.

- **OpenAI**: use effort levels — `minimal`, `low`, `medium`, `high`
- **Anthropic**: set an integer token budget. Range is 1024–32768; must be strictly less than `max_tokens`.
- **Google (Gemini)**: set an integer token budget. `0` -> disable thinking, `-1` -> dynamic thinking (model decides). Most models: 0–24576 tokens. Gemini 2.5 Pro: 128–32768 tokens (and cannot disable thinking).
- **OpenAI**: use effort levels — `minimal`, `low`, `medium`, `high`. Default: `medium`
- **Anthropic**: set an integer token budget. Range is 1024–32768; must be strictly less than `max_tokens`. Default: `8192` with `interleaved_thinking: true`
- **Google (Gemini 2.5)**: set an integer token budget. `0` -> disable thinking, `-1` -> dynamic thinking (model decides). Default: `-1` (dynamic)
- **Google (Gemini 3)**: use effort levels — `minimal` (Flash only), `low`, `medium`, `high`. Default: `high` for Pro, `medium` for Flash
- **Amazon Bedrock (Claude models)**: set an integer token budget, same as Anthropic. Default: `8192` with `interleaved_thinking: true`

Examples (OpenAI):

@@ -317,7 +319,7 @@ agents:
instruction: you are a helpful assistant that doesn't think very much
```

Examples (Google):
Examples (Google Gemini 2.5 - token-based):

```yaml
models:
@@ -329,7 +331,7 @@ models:
gemini-dynamic:
provider: google
model: gemini-2.5-flash
thinking_budget: -1 # Dynamic thinking (model decides)
thinking_budget: -1 # Dynamic thinking (model decides) - this is the default

gemini-fixed:
provider: google
@@ -342,29 +344,101 @@
instruction: you are a helpful assistant
```

#### Interleaved Thinking (Anthropic)
Examples (Google Gemini 3 - level-based):

Anthropic's interleaved thinking feature uses the Beta Messages API to provide tool calling during model reasoning. You can control this behavior using the `interleaved_thinking` provider option:
```yaml
models:
# Gemini 3 Pro: supports "low" and "high" levels
gemini-3-pro-high:
provider: google
model: gemini-3-pro
thinking_budget: high # Default for Pro models

gemini-3-pro-low:
provider: google
model: gemini-3-pro
thinking_budget: low

# Gemini 3 Flash: supports "minimal", "low", "medium", "high" levels
gemini-3-flash-medium:
provider: google
model: gemini-3-flash
thinking_budget: medium # Default for Flash models

gemini-3-flash-minimal:
provider: google
model: gemini-3-flash
thinking_budget: minimal

agents:
root:
model: gemini-3-pro-high
instruction: you are a helpful assistant
```

Examples (Amazon Bedrock Claude):

```yaml
models:
bedrock-claude:
provider: amazon-bedrock
model: global.anthropic.claude-sonnet-4-5-20250929-v1:0
# thinking_budget defaults to 8192 and interleaved_thinking defaults to true for Claude models
provider_opts:
region: us-east-1

bedrock-claude-custom:
provider: amazon-bedrock
model: anthropic.claude-sonnet-4-20250514-v1:0
thinking_budget: 16384 # Override default
provider_opts:
region: eu-west-1
interleaved_thinking: true

agents:
root:
model: bedrock-claude
instruction: you are a helpful assistant
```

#### Interleaved Thinking (Anthropic and Bedrock Claude)

Anthropic's interleaved thinking feature uses the Beta Messages API to provide tool calling during model reasoning. This is now enabled by default for both `anthropic` and `amazon-bedrock` (Claude models) providers. You can control this behavior using the `interleaved_thinking` provider option:

```yaml
models:
claude:
provider: anthropic
model: claude-sonnet-4-5-20250929
thinking_budget: 8192 # Optional: defaults to 16384 when interleaved thinking is enabled
# thinking_budget defaults to 8192
# interleaved_thinking defaults to true
provider_opts:
interleaved_thinking: true # Enable interleaved thinking (default: false)
interleaved_thinking: false # Disable if needed

bedrock-claude:
provider: amazon-bedrock
model: global.anthropic.claude-sonnet-4-5-20250929-v1:0
# thinking_budget defaults to 8192 for Claude models
# interleaved_thinking defaults to true for Claude models
provider_opts:
region: us-east-1
interleaved_thinking: false # Disable if needed
```

Notes:

- **OpenAI**: If an invalid effort value is set, the request will fail with a clear error
- **Anthropic**: Values < 1024 or ≥ `max_tokens` are ignored (warning logged). When `interleaved_thinking` is enabled,
Docker `cagent` uses Anthropic's Beta Messages API with a default thinking budget of 16384 tokens if not specified
- **Google**:
- **OpenAI**: If an invalid effort value is set, the request will fail with a clear error. Default: `medium`
- **Anthropic**: Values < 1024 or ≥ `max_tokens` are ignored (warning logged). Default: `thinking_budget: 8192` with `interleaved_thinking: true`
- **Amazon Bedrock (Claude)**: Same behavior as Anthropic. These defaults do not apply to non-Claude Bedrock models
- **Google (Gemini 2.5)**:
- Most models support values between -1 and 24576 tokens. Set to `0` to disable, `-1` for dynamic thinking
- Gemini 2.5 Pro: supports 128–32768 tokens. Cannot be disabled (minimum 128)
- Gemini 2.5 Flash-Lite: supports 512–24576 tokens. Set to `0` to disable, `-1` for dynamic thinking
- Default: `-1` (dynamic thinking)
- **Google (Gemini 3)**:
- Uses effort levels instead of token budgets: `minimal` (Flash only), `low`, `medium`, `high`
- Gemini 3 Pro default: `high`
- Gemini 3 Flash default: `medium`
- For unsupported providers, `thinking_budget` has no effect
- Debug logs include the applied effort (e.g., "OpenAI request using thinking_budget", "Gemini request using thinking_budget")
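Taken together, the per-provider rules above can be sketched in a single config. This is an illustrative sketch, not from the PR itself; the model names are examples and should be checked against your provider's catalog:

```yaml
models:
  # OpenAI: string effort level; an invalid value fails the request
  openai-thinker:
    provider: openai
    model: gpt-5-codex        # illustrative model name
    thinking_budget: medium   # the default; 'minimal'/'low'/'high' also valid

  # Anthropic: integer budget; values < 1024 or >= max_tokens are ignored with a warning
  claude-thinker:
    provider: anthropic
    model: claude-sonnet-4-5-20250929
    max_tokens: 16384
    thinking_budget: 8192     # must satisfy 1024 <= budget < max_tokens

  # Gemini 2.5: integer budget; -1 = dynamic, 0 = disable (not allowed on 2.5 Pro)
  gemini-25-thinker:
    provider: google
    model: gemini-2.5-flash
    thinking_budget: -1       # dynamic thinking, the default
```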

31 changes: 25 additions & 6 deletions e2e/cagent_exec_test.go
@@ -45,37 +45,56 @@ func TestExec_OpenAI_gpt5_1(t *testing.T) {
func TestExec_OpenAI_gpt5_codex(t *testing.T) {
out := cagentExec(t, "testdata/basic.yaml", "--model=openai/gpt-5-codex", "What's 2+2?")

require.Equal(t, "\n--- Agent: root ---\n**Preparing to answer question 4**2 + 2 = 4.", out)
// Model reasoning summary varies, just check for the core response
require.Contains(t, out, "--- Agent: root ---")
require.Contains(t, out, "2 + 2 = 4")
}

func TestExec_Anthropic(t *testing.T) {
out := cagentExec(t, "testdata/basic.yaml", "--model=anthropic/claude-sonnet-4-0", "What's 2+2?")

require.Equal(t, "\n--- Agent: root ---\n2 + 2 = 4", out)
// With interleaved thinking enabled by default, Anthropic responses include thinking content
require.Contains(t, out, "--- Agent: root ---")
require.Contains(t, out, "2 + 2 = 4")
}

func TestExec_Anthropic_ToolCall(t *testing.T) {
out := cagentExec(t, "testdata/fs_tools.yaml", "--model=anthropic/claude-sonnet-4-0", "How many files in testdata/working_dir? Only output the number.")

require.Equal(t, "\n--- Agent: root ---\n\nCalling list_directory(path: \"testdata/working_dir\")\n\nlist_directory response → \"FILE README.me\\n\"\n1", out)
// With interleaved thinking enabled by default, Anthropic responses include thinking content
require.Contains(t, out, "--- Agent: root ---")
require.Contains(t, out, `Calling list_directory(path: "testdata/working_dir")`)
require.Contains(t, out, `list_directory response → "FILE README.me\n"`)
// The response should end with "1" (the count)
require.True(t, out != "" && out[len(out)-1] == '1', "response should end with '1'")
}

func TestExec_Anthropic_AgentsMd(t *testing.T) {
out := cagentExec(t, "testdata/agents-md.yaml", "--model=anthropic/claude-sonnet-4-0", "What's 2+2?")

require.Equal(t, "\n--- Agent: root ---\n2 + 2 = 4", out)
// With interleaved thinking enabled by default, Anthropic responses include thinking content
require.Contains(t, out, "--- Agent: root ---")
require.Contains(t, out, "2 + 2 = 4")
}

func TestExec_Gemini(t *testing.T) {
out := cagentExec(t, "testdata/basic.yaml", "--model=google/gemini-2.5-flash", "What's 2+2?")

require.Equal(t, "\n--- Agent: root ---\n2 + 2 = 4", out)
// With thinking enabled by default (dynamic thinking for Gemini 2.5), responses may include thinking content
require.Contains(t, out, "--- Agent: root ---")
// The response should contain the answer "4" somewhere
require.Contains(t, out, "4")
}

func TestExec_Gemini_ToolCall(t *testing.T) {
out := cagentExec(t, "testdata/fs_tools.yaml", "--model=google/gemini-2.5-flash", "How many files in testdata/working_dir? Only output the number.")

require.Equal(t, "\n--- Agent: root ---\n\nCalling list_directory(path: \"testdata/working_dir\")\n\nlist_directory response → \"FILE README.me\\n\"\n1", out)
// With thinking enabled by default (dynamic thinking for Gemini 2.5), responses include thinking content
require.Contains(t, out, "--- Agent: root ---")
require.Contains(t, out, `Calling list_directory(path: "testdata/working_dir")`)
require.Contains(t, out, `list_directory response → "FILE README.me\n"`)
// The response should end with "1" (the count)
require.True(t, out != "" && out[len(out)-1] == '1', "response should end with '1'")
}

func TestExec_Mistral(t *testing.T) {
3 changes: 2 additions & 1 deletion e2e/cagent_mcp_test.go
@@ -54,5 +54,6 @@ func TestMCP_MultiAgent(t *testing.T) {
})

require.NoError(t, err)
assert.Equal(t, "Hello, nice to meet you!", output.Response)
// Model response to "say hello" can vary, just check it contains a greeting
assert.Contains(t, output.Response, "Hello")
}