feat(anthropic): enable interleaved thinking for agentic workflows

FammasMaz · FammasMaz · commit 53e1facf8e00 · 2025-12-24T11:18:02.000+01:00
Add support for Claude to think between tool calls in agentic sessions:
- Add anthropic-beta header (interleaved-thinking-2025-05-14) for Claude models when reasoning is enabled
- Track thinking block transitions in streaming accumulator
- Handle standalone signature deltas in streaming pipeline
- Close text blocks before starting new thinking blocks (enables interleaving)
- Log interleaved thinking summary at stream completion

Enables Claude Code to display thinking between tool executions.
diff --git a/src/proxy_app/main.py b/src/proxy_app/main.py
@@ -1015,6 +1015,13 @@ async def anthropic_messages(
     # Initialize logger if enabled
     logger = DetailedLogger() if ENABLE_REQUEST_LOGGING else None
 
+    # Log if client is requesting interleaved thinking (informational)
+    anthropic_beta = request.headers.get("anthropic-beta", "")
+    if "interleaved-thinking" in anthropic_beta:
+        logging.getLogger("rotator_library").debug(
+            f"[Anthropic API] Client requested interleaved thinking: {anthropic_beta}"
+        )
+
     try:
         # Log the request to console
         log_request_to_console(
diff --git a/src/rotator_library/anthropic_compat/streaming.py b/src/rotator_library/anthropic_compat/streaming.py
@@ -163,11 +163,26 @@ async def anthropic_streaming_wrapper(
             # Always capture signature if available (may come in later deltas)
             if thought_sig_from_delta and not thinking_signature:
                 thinking_signature = thought_sig_from_delta
+                logger.debug(
+                    f"[SIGNATURE] Streaming wrapper captured signature: "
+                    f"{thought_sig_from_delta[:50]}..."
+                )
 
             if reasoning_content:
-                import logging
-                logging.getLogger("rotator_library").debug(
-                    f"[Anthropic Stream] Sending thinking ({len(reasoning_content)} chars), sig={bool(thinking_signature)}"
+                # Enhanced DEBUG logging for interleaved thinking verification
+                context = "initial" if current_block_index == 0 else "interleaved"
+                prev_block = "none"
+                if thinking_block_started:
+                    prev_block = "thinking"
+                elif content_block_started:
+                    prev_block = "text"
+                elif tool_calls_by_index:
+                    prev_block = "tool_use"
+
+                logger.debug(
+                    f"[INTERLEAVED] Anthropic stream: block #{current_block_index}, "
+                    f"context={context}, prev={prev_block}, "
+                    f"chars={len(reasoning_content)}, sig={bool(thinking_signature)}"
                 )
                 if not thinking_block_started:
                     # Close any open text block before starting a new thinking block
@@ -199,6 +214,10 @@ async def anthropic_streaming_wrapper(
             if content:
                 # If we were in a thinking block, close it first
                 if thinking_block_started and not content_block_started:
+                    logger.debug(
+                        f"[INTERLEAVED] Closing thinking block (idx={current_block_index}) for text, "
+                        f"has_sig={bool(thinking_signature)}"
+                    )
                     # Send signature_delta if we have a signature
                     if thinking_signature:
                         sig_delta = {
@@ -238,6 +257,9 @@ async def anthropic_streaming_wrapper(
                 if tc_index not in tool_calls_by_index:
                     # Close previous thinking block if open
                     if thinking_block_started:
+                        logger.debug(
+                            f"[INTERLEAVED] Closing thinking block (idx={current_block_index}) for tool_use"
+                        )
                         # Send signature_delta if we have a signature
                         if thinking_signature:
                             sig_delta = {
diff --git a/src/rotator_library/providers/antigravity_provider.py b/src/rotator_library/providers/antigravity_provider.py
@@ -3741,20 +3741,36 @@ def _gemini_to_openai_chunk(
             if has_sig and is_thought and accumulator is not None:
                 accumulator["thought_signature"] = part["thoughtSignature"]
 
-            # Skip standalone signature parts
+            # Handle standalone signature parts - send them as a delta with just the signature
+            # This is critical for Claude Code to receive the signature for thinking blocks
             if has_sig and not has_func and (not has_text or not part.get("text")):
-                continue
+                # Do not skip this part: fall through so that a signature-only delta
+                # can be built later from the signature held in the accumulator
+                pass  # No-op: processing falls through to the delta-building code
 
             if has_text:
                 text = part["text"]
                 if is_thought:
                     reasoning_content += text
                     if accumulator is not None:
                         accumulator["reasoning_content"] += text
+                        # Count thinking that resumes after text or a tool call
+                        last_type = accumulator.get("last_content_type")
+                        if last_type and last_type != "thinking":
+                            accumulator["thinking_block_count"] = (
+                                accumulator.get("thinking_block_count", 0) + 1
+                            )
+                            lib_logger.debug(
+                                f"[INTERLEAVED] Thinking block "
+                                f"#{accumulator['thinking_block_count']}: "
+                                f"after={last_type}, chars={len(text)}"
+                            )
+                        accumulator["last_content_type"] = "thinking"
                 else:
                     text_content += text
                     if accumulator is not None:
                         accumulator["text_content"] += text
+                        accumulator["last_content_type"] = "text"
 
             if has_func:
                 # Get tool_schemas from accumulator for schema-aware parsing
@@ -3769,6 +3785,9 @@ def _gemini_to_openai_chunk(
 
                 tool_calls.append(tool_call)
                 tool_idx += 1
+                # Track tool call for interleaved thinking detection
+                if accumulator is not None:
+                    accumulator["last_content_type"] = "tool_call"
 
         # Build delta
         delta = {}
@@ -3777,8 +3796,19 @@ def _gemini_to_openai_chunk(
         if reasoning_content:
             delta["reasoning_content"] = reasoning_content
             # Include thought_signature if available (from accumulator)
+            # The signature arrives at the END of thinking, so we include it
+            # with EVERY reasoning delta once captured - streaming wrapper
+            # will capture it and use it when closing the thinking block
             if accumulator and accumulator.get("thought_signature"):
                 delta["thought_signature"] = accumulator["thought_signature"]
+        # Send signature-only delta when signature arrives without content
+        # This ensures the streaming wrapper receives the signature
+        elif accumulator and accumulator.get("thought_signature"):
+            # Only when the chunk has no text/tool calls; signature may not be new
+            sig = accumulator.get("thought_signature")
+            if sig and not text_content and not tool_calls:
+                delta["thought_signature"] = sig
+                delta["role"] = "assistant"
         if tool_calls:
             delta["tool_calls"] = tool_calls
             delta["role"] = "assistant"
@@ -4284,6 +4314,20 @@ async def acompletion(
             **ANTIGRAVITY_HEADERS,
         }
 
+        # Add interleaved thinking header for Claude thinking models
+        # This enables thinking between tool calls for agentic workflows
+        if self._is_claude(model) and reasoning_effort and reasoning_effort != "disable":
+            interleaved_header = "interleaved-thinking-2025-05-14"
+            existing_beta = headers.get("anthropic-beta", "")
+            if existing_beta:
+                if interleaved_header not in existing_beta:
+                    headers["anthropic-beta"] = f"{existing_beta},{interleaved_header}"
+            else:
+                headers["anthropic-beta"] = interleaved_header
+            lib_logger.debug(
+                f"[Antigravity] Added interleaved thinking header for {model}"
+            )
+
         # Track malformed call retries (separate from empty response retries)
         malformed_retry_count = 0
         # Keep a mutable reference to gemini_contents for retry injection
@@ -4620,6 +4664,9 @@ async def _handle_streaming(
             "tool_schemas": tool_schemas,  # For schema-aware JSON string parsing
             "malformed_call": None,  # Track MALFORMED_FUNCTION_CALL if detected
             "response_id": None,  # Track original response ID for synthetic chunks
+            # Interleaved thinking tracking for DEBUG logging
+            "thinking_block_count": 0,  # Count of thinking block transitions
+            "last_content_type": None,  # Track: "thinking", "text", "tool_call"
         }
 
         async with client.stream(
@@ -4706,6 +4753,14 @@ async def _handle_streaming(
                     final_chunk["usage"] = accumulator["last_usage"]
                 yield litellm.ModelResponse(**final_chunk)
 
+            # Log interleaved thinking summary at stream completion
+            thinking_block_count = accumulator.get("thinking_block_count", 0)
+            if thinking_block_count > 0:
+                lib_logger.info(
+                    f"[Antigravity] Stream completed with {thinking_block_count} "
+                    f"interleaved thinking block(s) for {model}"
+                )
+
             # Cache Claude thinking after stream completes
             if (
                 self._is_claude(model)