fix(vertex_and_google_ai_studio_gemini.py): bubble up thoughtSignature to client

krrishdholakia · krrishdholakia · commit 51c73dc60ba2 · 2025-08-30T17:26:18.000-07:00
diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -43,6 +43,7 @@
 from litellm.types.llms.openai import (
     AllMessageValues,
     ChatCompletionResponseMessage,
+    ChatCompletionThinkingBlock,
     ChatCompletionToolCallChunk,
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionToolParamFunctionChunk,
@@ -792,7 +793,25 @@ def get_assistant_content_message(
                     content_str += _content_str
 
         return content_str, reasoning_content_str
-    
+
+    def _extract_thinking_blocks_from_parts(
+        self, parts: List[HttpxPartType]
+    ) -> List[ChatCompletionThinkingBlock]:
+        """Return one thinking block per part that carries a `thoughtSignature`."""
+        blocks: List[ChatCompletionThinkingBlock] = []
+        for part in parts:
+            if "thoughtSignature" not in part:
+                continue
+            # The signature is popped off a copy (the input part is left intact);
+            # whatever remains of the part is JSON-encoded as the block's `thinking`.
+            remainder = part.copy()
+            signature = remainder.pop("thoughtSignature")
+            block = ChatCompletionThinkingBlock(
+                type="thinking", thinking=json.dumps(remainder), signature=signature
+            )
+            blocks.append(block)
+        return blocks
+
     def _extract_image_response_from_parts(
         self, parts: List[HttpxPartType]
     ) -> Optional[ImageURLObject]:
@@ -804,10 +823,7 @@ def _extract_image_response_from_parts(
                 if mime_type.startswith("image/"):
                     # Convert base64 data to data URI format
                     data_uri = f"data:{mime_type};base64,{data}"
-                    return ImageURLObject(
-                        url=data_uri,
-                        detail="auto"
-                    )
+                    return ImageURLObject(url=data_uri, detail="auto")
         return None
 
     def _extract_audio_response_from_parts(
@@ -1127,7 +1143,7 @@ def _calculate_web_search_requests(grounding_metadata: List[dict]) -> Optional[i
                 elif web_search_queries:
                     web_search_requests = len(grounding_metadata)
         return web_search_requests
-    
+
     @staticmethod
     def _create_streaming_choice(
         chat_completion_message: ChatCompletionResponseMessage,
@@ -1151,9 +1167,7 @@ def _create_streaming_choice(
             index=candidate.get("index", idx),
             delta=Delta(
                 content=chat_completion_message.get("content"),
-                reasoning_content=chat_completion_message.get(
-                    "reasoning_content"
-                ),
+                reasoning_content=chat_completion_message.get("reasoning_content"),
                 tool_calls=tools,
                 image=image_response,
                 function_call=functions,
@@ -1164,21 +1178,23 @@ def _create_streaming_choice(
         return choice
 
     @staticmethod
-    def _extract_candidate_metadata(candidate: Candidates) -> Tuple[List[dict], List[dict], List, List]:
+    def _extract_candidate_metadata(
+        candidate: Candidates,
+    ) -> Tuple[List[dict], List[dict], List, List]:
         """
         Extract metadata from a single candidate response.
-        
+
         Returns:
             grounding_metadata: List[dict]
-            url_context_metadata: List[dict] 
+            url_context_metadata: List[dict]
             safety_ratings: List
             citation_metadata: List
         """
         grounding_metadata: List[dict] = []
         url_context_metadata: List[dict] = []
         safety_ratings: List = []
         citation_metadata: List = []
-        
+
         if "groundingMetadata" in candidate:
             if isinstance(candidate["groundingMetadata"], list):
                 grounding_metadata.extend(candidate["groundingMetadata"])  # type: ignore
@@ -1194,8 +1210,13 @@ def _extract_candidate_metadata(candidate: Candidates) -> Tuple[List[dict], List
         if "urlContextMetadata" in candidate:
             # Add URL context metadata to grounding metadata
             url_context_metadata.append(cast(dict, candidate["urlContextMetadata"]))
-            
-        return grounding_metadata, url_context_metadata, safety_ratings, citation_metadata
+
+        return (
+            grounding_metadata,
+            url_context_metadata,
+            safety_ratings,
+            citation_metadata,
+        )
 
     @staticmethod
     def _process_candidates(
@@ -1227,6 +1248,7 @@ def _process_candidates(
         tools: Optional[List[ChatCompletionToolCallChunk]] = []
         functions: Optional[ChatCompletionToolCallFunctionChunk] = None
         cumulative_tool_call_index: int = 0
+        thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
 
         for idx, candidate in enumerate(_candidates):
             if "content" not in candidate:
@@ -1239,7 +1261,7 @@ def _process_candidates(
                 candidate_safety_ratings,
                 candidate_citation_metadata,
             ) = VertexGeminiConfig._extract_candidate_metadata(candidate)
-            
+
             grounding_metadata.extend(candidate_grounding_metadata)
             url_context_metadata.extend(candidate_url_context_metadata)
             safety_ratings.extend(candidate_safety_ratings)
@@ -1264,14 +1286,22 @@ def _process_candidates(
                     )
                 )
 
+                thinking_blocks = (
+                    VertexGeminiConfig()._extract_thinking_blocks_from_parts(
+                        parts=candidate["content"]["parts"]
+                    )
+                )
+
                 if audio_response is not None:
                     cast(Dict[str, Any], chat_completion_message)[
                         "audio"
                     ] = audio_response
                     chat_completion_message["content"] = None  # OpenAI spec
                 if image_response is not None:
                     # Handle image response - combine with text content into structured format
-                    cast(Dict[str, Any], chat_completion_message)["image"] = image_response
+                    cast(Dict[str, Any], chat_completion_message)[
+                        "image"
+                    ] = image_response
                 if content is not None:
                     chat_completion_message["content"] = content
 
@@ -1298,15 +1328,18 @@ def _process_candidates(
             if functions is not None:
                 chat_completion_message["function_call"] = functions
 
+            if thinking_blocks is not None:
+                chat_completion_message["thinking_blocks"] = thinking_blocks  # type: ignore
+
             if isinstance(model_response, ModelResponseStream):
                 choice = VertexGeminiConfig._create_streaming_choice(
                     chat_completion_message=chat_completion_message,
-                    candidate=candidate, 
-                    idx=idx, 
-                    tools=tools, 
-                    functions=functions, 
+                    candidate=candidate,
+                    idx=idx,
+                    tools=tools,
+                    functions=functions,
                     chat_completion_logprobs=chat_completion_logprobs,
-                    image_response=image_response
+                    image_response=image_response,
                 )
                 model_response.choices.append(choice)
             elif isinstance(model_response, ModelResponse):
diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py
@@ -43,10 +43,14 @@
 
 # Handle OpenAI SDK version compatibility for Text type
 try:
-    from openai.types.responses.response_create_params import Text as ResponseText
+    from openai.types.responses.response_create_params import (
+        Text as ResponseText,  # type: ignore
+    )
 except (ImportError, AttributeError):
     # Fall back to the concrete config type available in all SDK versions
-    from openai.types.responses.response_text_config_param import ResponseTextConfigParam as ResponseText
+    from openai.types.responses.response_text_config_param import (
+        ResponseTextConfigParam as ResponseText,
+    )
 
 from openai.types.responses.response_create_params import (
     Reasoning,
diff --git a/litellm/types/llms/vertex_ai.py b/litellm/types/llms/vertex_ai.py
@@ -72,6 +72,7 @@ class HttpxPartType(TypedDict, total=False):
     executableCode: HttpxExecutableCode
     codeExecutionResult: HttpxCodeExecutionResult
     thought: bool
+    thoughtSignature: str
 
 
 class HttpxContentType(TypedDict, total=False):
@@ -245,10 +246,11 @@ class UsageMetadata(TypedDict, total=False):
 class TokenCountDetailsResponse(TypedDict):
     """
     Response structure for token count details with modality breakdown.
-    
+
     Example:
         {'totalTokens': 12, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 12}]}
     """
+
     totalTokens: int
     promptTokensDetails: List[PromptTokensDetails]
 
diff --git a/tests/llm_translation/test_gemini.py b/tests/llm_translation/test_gemini.py
@@ -436,7 +436,10 @@ def test_gemini_with_empty_function_call_arguments():
 async def test_claude_tool_use_with_gemini():
     response = await litellm.anthropic.messages.acreate(
         messages=[
-            {"role": "user", "content": "Hello, can you tell me the weather in Boston. Please respond with a tool call?"}
+            {
+                "role": "user",
+                "content": "Hello, can you tell me the weather in Boston. Please respond with a tool call?",
+            }
         ],
         model="gemini/gemini-2.5-flash",
         stream=True,
@@ -578,11 +581,17 @@ def test_gemini_tool_use():
     assert stop_reason is not None
     assert stop_reason == "tool_calls"
 
+
 @pytest.mark.asyncio
 async def test_gemini_image_generation_async():
     litellm._turn_on_debug()
     response = await litellm.acompletion(
-        messages=[{"role": "user", "content": "Generate an image of a banana wearing a costume that says LiteLLM"}],
+        messages=[
+            {
+                "role": "user",
+                "content": "Generate an image of a banana wearing a costume that says LiteLLM",
+            }
+        ],
         model="gemini/gemini-2.5-flash-image-preview",
     )
 
@@ -597,12 +606,16 @@ async def test_gemini_image_generation_async():
     assert IMAGE_URL["url"].startswith("data:image/png;base64,")
 
 
-
 @pytest.mark.asyncio
 async def test_gemini_image_generation_async_stream():
-    #litellm._turn_on_debug()
+    # litellm._turn_on_debug()
     response = await litellm.acompletion(
-        messages=[{"role": "user", "content": "Generate an image of a banana wearing a costume that says LiteLLM"}],
+        messages=[
+            {
+                "role": "user",
+                "content": "Generate an image of a banana wearing a costume that says LiteLLM",
+            }
+        ],
         model="gemini/gemini-2.5-flash-image-preview",
         stream=True,
     )
@@ -611,35 +624,72 @@ async def test_gemini_image_generation_async_stream():
     model_response_image = None
     async for chunk in response:
         print("CHUNK: ", chunk)
-        if hasattr(chunk.choices[0].delta, "image") and chunk.choices[0].delta.image is not None:
+        if (
+            hasattr(chunk.choices[0].delta, "image")
+            and chunk.choices[0].delta.image is not None
+        ):
             model_response_image = chunk.choices[0].delta.image
             print("MODEL_RESPONSE_IMAGE: ", model_response_image)
             assert model_response_image is not None
             assert model_response_image["url"].startswith("data:image/png;base64,")
             break
-    
+
     #########################################################
     # Important: Validate we did get an image in the response
     #########################################################
     assert model_response_image is not None
     assert model_response_image["url"].startswith("data:image/png;base64,")
-    
+
 
 def test_system_message_with_no_user_message():
-        """
-        Test that the system message is translated correctly for non-OpenAI providers.
-        """
-        messages = [
-            {
-                "role": "system",
-                "content": "Be a good bot!",
-            },
-        ]
+    """
+    Test that the system message is translated correctly for non-OpenAI providers.
+    """
+    messages = [
+        {
+            "role": "system",
+            "content": "Be a good bot!",
+        },
+    ]
 
-        response = litellm.completion(
-            model="gemini/gemini-2.5-flash",
-            messages=messages,
-        )
-        assert response is not None
+    response = litellm.completion(
+        model="gemini/gemini-2.5-flash",
+        messages=messages,
+    )
+    assert response is not None
 
-        assert response.choices[0].message.content is not None
+    assert response.choices[0].message.content is not None
+
+
+def test_gemini_with_thinking():
+    """Smoke-test a tool-enabled completion against gemini/gemini-2.5-flash.
+
+    Sends one user message plus the `get_current_weather` tool definition and
+    prints the response. NOTE(review): nothing is asserted here, so the test
+    passes whenever the call does not raise; consider asserting on the
+    returned thinking blocks.
+    """
+    from litellm import completion
+
+    litellm._turn_on_debug()
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location"],
+                },
+            },
+        }
+    ]
+    messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+    result = completion(
+        model="gemini/gemini-2.5-flash",
+        messages=messages,
+        tools=tools,
+    )
+    print(f"result: {result}")