@@ -280,121 +280,16 @@ async def stream(
 
         logger.debug("invoking model with stream=%s", litellm_request.get("stream"))
 
-        if not is_streaming:
-            response = await litellm.acompletion(**self.client_args, **litellm_request)
-
-            logger.debug("got non-streaming response from model")
-            yield self.format_chunk({"chunk_type": "message_start"})
-            yield self.format_chunk({"chunk_type": "content_start", "data_type": "text"})
-
-            tool_calls: dict[int, list[Any]] = {}
-            finish_reason = None
-
-            if hasattr(response, "choices") and response.choices and len(response.choices) > 0:
-                choice = response.choices[0]
-
-                if hasattr(choice, "message") and choice.message:
-                    if hasattr(choice.message, "content") and choice.message.content:
-                        yield self.format_chunk(
-                            {"chunk_type": "content_delta", "data_type": "text", "data": choice.message.content}
-                        )
-
-                    if hasattr(choice.message, "reasoning_content") and choice.message.reasoning_content:
-                        yield self.format_chunk(
-                            {
-                                "chunk_type": "content_delta",
-                                "data_type": "reasoning_content",
-                                "data": choice.message.reasoning_content,
-                            }
-                        )
-
-                    if hasattr(choice.message, "tool_calls") and choice.message.tool_calls:
-                        for i, tool_call in enumerate(choice.message.tool_calls):
-                            tool_calls.setdefault(i, []).append(tool_call)
-
-                if hasattr(choice, "finish_reason"):
-                    finish_reason = choice.finish_reason
-
-            yield self.format_chunk({"chunk_type": "content_stop", "data_type": "text"})
-
-            for tool_deltas in tool_calls.values():
-                yield self.format_chunk({"chunk_type": "content_start", "data_type": "tool", "data": tool_deltas[0]})
-
-                for tool_delta in tool_deltas:
-                    yield self.format_chunk({"chunk_type": "content_delta", "data_type": "tool", "data": tool_delta})
-
-                yield self.format_chunk({"chunk_type": "content_stop", "data_type": "tool"})
-
-            yield self.format_chunk({"chunk_type": "message_stop", "data": finish_reason})
-
-            # Add usage information if available
-            if hasattr(response, "usage"):
-                yield self.format_chunk({"chunk_type": "metadata", "data": response.usage})
-        else:
-            # For streaming, use the streaming API
-            response = await litellm.acompletion(**self.client_args, **litellm_request)
-
-            logger.debug("got streaming response from model")
-            yield self.format_chunk({"chunk_type": "message_start"})
-            yield self.format_chunk({"chunk_type": "content_start", "data_type": "text"})
-
-            streaming_tool_calls: dict[int, list[Any]] = {}
-            finish_reason = None
-
-            try:
-                async for event in response:
-                    # Defensive: skip events with empty or missing choices
-                    if not getattr(event, "choices", None):
-                        continue
-                    choice = event.choices[0]
-
-                    if choice.delta.content:
-                        yield self.format_chunk(
-                            {"chunk_type": "content_delta", "data_type": "text", "data": choice.delta.content}
-                        )
-
-                    if hasattr(choice.delta, "reasoning_content") and choice.delta.reasoning_content:
-                        yield self.format_chunk(
-                            {
-                                "chunk_type": "content_delta",
-                                "data_type": "reasoning_content",
-                                "data": choice.delta.reasoning_content,
-                            }
-                        )
-
-                    for tool_call in choice.delta.tool_calls or []:
-                        streaming_tool_calls.setdefault(tool_call.index, []).append(tool_call)
-
-                    if choice.finish_reason:
-                        finish_reason = choice.finish_reason
-                        break
-            except Exception as e:
-                logger.warning("Error processing streaming response: %s", e)
-
-            yield self.format_chunk({"chunk_type": "content_stop", "data_type": "text"})
-
-            # Process tool calls
-            for tool_deltas in streaming_tool_calls.values():
-                yield self.format_chunk({"chunk_type": "content_start", "data_type": "tool", "data": tool_deltas[0]})
-
-                for tool_delta in tool_deltas:
-                    yield self.format_chunk({"chunk_type": "content_delta", "data_type": "tool", "data": tool_delta})
-
-                yield self.format_chunk({"chunk_type": "content_stop", "data_type": "tool"})
-
-            yield self.format_chunk({"chunk_type": "message_stop", "data": finish_reason})
-
-            try:
-                last_event = None
-                async for event in response:
-                    last_event = event
-
-                # Use the last event for usage information
-                if last_event and hasattr(last_event, "usage"):
-                    yield self.format_chunk({"chunk_type": "metadata", "data": last_event.usage})
-            except Exception:
-                # If there's an error collecting remaining events, just continue
-                pass
+        try:
+            if is_streaming:
+                async for chunk in self._handle_streaming_response(litellm_request):
+                    yield chunk
+            else:
+                async for chunk in self._handle_non_streaming_response(litellm_request):
+                    yield chunk
+        except ContextWindowExceededError as e:
+            logger.warning("litellm client raised context window overflow")
+            raise ContextWindowOverflowException(e) from e
 
         logger.debug("finished processing response from model")
 
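The rewritten `stream()` body above delegates to two private handlers and narrows error handling to one concern: litellm's `ContextWindowExceededError` is translated into the SDK's `ContextWindowOverflowException`, so callers only ever see the SDK-level exception. A minimal, self-contained sketch of that translation pattern (the exception classes and generator below are stand-ins for illustration, not the real imports from this PR):

import asyncio


class ContextWindowExceededError(Exception):
    """Stand-in for the provider-side (litellm) overflow error."""


class ContextWindowOverflowException(Exception):
    """Stand-in for the SDK-level exception that stream() raises instead."""


async def stream_sketch(prompt_tokens: int, window: int = 8):
    """Yield chunks, translating the provider error at the boundary."""
    try:
        if prompt_tokens > window:
            # In the real code this error surfaces from litellm.acompletion(...)
            raise ContextWindowExceededError("prompt exceeds context window")
        for chunk in ("message_start", "content_delta", "message_stop"):
            yield chunk
    except ContextWindowExceededError as e:
        raise ContextWindowOverflowException(e) from e


async def main():
    async for chunk in stream_sketch(prompt_tokens=4):
        print(chunk)
    try:
        async for chunk in stream_sketch(prompt_tokens=100):
            print(chunk)
    except ContextWindowOverflowException as e:
        print("caller sees a single SDK exception:", e)


asyncio.run(main())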
@@ -481,6 +376,181 @@ async def _structured_output_using_tool(
         except (json.JSONDecodeError, TypeError, ValueError) as e:
             raise ValueError(f"Failed to parse or load content into model: {e}") from e
 
+    async def _process_choice_content(
+        self, choice: Any, data_type: str | None, tool_calls: dict[int, list[Any]], is_streaming: bool = True
+    ) -> AsyncGenerator[tuple[str | None, StreamEvent], None]:
+        """Process content from a choice object (streaming or non-streaming).
+
+        Args:
+            choice: The choice object from the response.
+            data_type: Current data type being processed.
+            tool_calls: Dictionary to collect tool calls.
+            is_streaming: Whether this is from a streaming response.
+
+        Yields:
+            Tuples of (updated_data_type, stream_event).
+        """
+        # Get the content source - this is the only difference between streaming/non-streaming.
+        # We use duck typing here: both choice.delta and choice.message expose the same interface
+        # (reasoning_content, content, tool_calls attributes) but are different object structures.
+        content_source = choice.delta if is_streaming else choice.message
+
+        # Process reasoning content
+        if hasattr(content_source, "reasoning_content") and content_source.reasoning_content:
+            chunks, data_type = self._stream_switch_content("reasoning_content", data_type)
+            for chunk in chunks:
+                yield data_type, chunk
+            chunk = self.format_chunk(
+                {
+                    "chunk_type": "content_delta",
+                    "data_type": "reasoning_content",
+                    "data": content_source.reasoning_content,
+                }
+            )
+            yield data_type, chunk
+
+        # Process text content
+        if hasattr(content_source, "content") and content_source.content:
+            chunks, data_type = self._stream_switch_content("text", data_type)
+            for chunk in chunks:
+                yield data_type, chunk
+            chunk = self.format_chunk(
+                {
+                    "chunk_type": "content_delta",
+                    "data_type": "text",
+                    "data": content_source.content,
+                }
+            )
+            yield data_type, chunk
+
+        # Process tool calls
+        if hasattr(content_source, "tool_calls") and content_source.tool_calls:
+            if is_streaming:
+                # Streaming: tool calls carry an index attribute for out-of-order delivery
+                for tool_call in content_source.tool_calls:
+                    tool_calls.setdefault(tool_call.index, []).append(tool_call)
+            else:
+                # Non-streaming: tool calls arrive in order, use the enumerated index
+                for i, tool_call in enumerate(content_source.tool_calls):
+                    tool_calls.setdefault(i, []).append(tool_call)
+
+    async def _process_tool_calls(self, tool_calls: dict[int, list[Any]]) -> AsyncGenerator[StreamEvent, None]:
+        """Process and yield tool call events.
+
+        Args:
+            tool_calls: Dictionary of tool calls indexed by their position.
+
+        Yields:
+            Formatted tool call chunks.
+        """
+        for tool_deltas in tool_calls.values():
+            yield self.format_chunk({"chunk_type": "content_start", "data_type": "tool", "data": tool_deltas[0]})
+
+            for tool_delta in tool_deltas:
+                yield self.format_chunk({"chunk_type": "content_delta", "data_type": "tool", "data": tool_delta})
+
+            yield self.format_chunk({"chunk_type": "content_stop", "data_type": "tool"})
+
+    async def _handle_non_streaming_response(
+        self, litellm_request: dict[str, Any]
+    ) -> AsyncGenerator[StreamEvent, None]:
+        """Handle non-streaming response from LiteLLM.
+
+        Args:
+            litellm_request: The formatted request for LiteLLM.
+
+        Yields:
+            Formatted message chunks from the model.
+        """
+        response = await litellm.acompletion(**self.client_args, **litellm_request)
+
+        logger.debug("got non-streaming response from model")
+        yield self.format_chunk({"chunk_type": "message_start"})
+
+        tool_calls: dict[int, list[Any]] = {}
+        data_type: str | None = None
+        finish_reason: str | None = None
+
+        if hasattr(response, "choices") and response.choices and len(response.choices) > 0:
+            choice = response.choices[0]
+
+            if hasattr(choice, "message") and choice.message:
+                # Process content using shared logic
+                async for updated_data_type, chunk in self._process_choice_content(
+                    choice, data_type, tool_calls, is_streaming=False
+                ):
+                    data_type = updated_data_type
+                    yield chunk
+
+            if hasattr(choice, "finish_reason"):
+                finish_reason = choice.finish_reason
+
+        # Stop the current content block if we have one
+        if data_type:
+            yield self.format_chunk({"chunk_type": "content_stop", "data_type": data_type})
+
+        # Process tool calls
+        async for chunk in self._process_tool_calls(tool_calls):
+            yield chunk
+
+        yield self.format_chunk({"chunk_type": "message_stop", "data": finish_reason})
+
+        # Add usage information if available
+        if hasattr(response, "usage"):
+            yield self.format_chunk({"chunk_type": "metadata", "data": response.usage})
+
+    async def _handle_streaming_response(self, litellm_request: dict[str, Any]) -> AsyncGenerator[StreamEvent, None]:
+        """Handle streaming response from LiteLLM.
+
+        Args:
+            litellm_request: The formatted request for LiteLLM.
+
+        Yields:
+            Formatted message chunks from the model.
+        """
+        # For streaming, use the streaming API
+        response = await litellm.acompletion(**self.client_args, **litellm_request)
+
+        logger.debug("got streaming response from model")
+        yield self.format_chunk({"chunk_type": "message_start"})
+
+        tool_calls: dict[int, list[Any]] = {}
+        data_type: str | None = None
+        finish_reason: str | None = None
+
+        async for event in response:
+            # Defensive: skip events with empty or missing choices
+            if not getattr(event, "choices", None):
+                continue
+            choice = event.choices[0]
+
+            # Process content using shared logic
+            async for updated_data_type, chunk in self._process_choice_content(
+                choice, data_type, tool_calls, is_streaming=True
+            ):
+                data_type = updated_data_type
+                yield chunk
+
+            if choice.finish_reason:
+                finish_reason = choice.finish_reason
+                if data_type:
+                    yield self.format_chunk({"chunk_type": "content_stop", "data_type": data_type})
+                break
+
+        # Process tool calls
+        async for chunk in self._process_tool_calls(tool_calls):
+            yield chunk
+
+        yield self.format_chunk({"chunk_type": "message_stop", "data": finish_reason})
+
+        # Drain remaining events; we only need the final usage payload
+        async for event in response:
+            _ = event
+            if event.usage:
+                yield self.format_chunk({"chunk_type": "metadata", "data": event.usage})
+
+        logger.debug("finished streaming response from model")
+
     def _apply_proxy_prefix(self) -> None:
         """Apply litellm_proxy/ prefix to model_id when use_litellm_proxy is True.
 
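The `_process_choice_content` helper added above relies on a streaming `choice.delta` and a non-streaming `choice.message` exposing the same three attributes (`content`, `reasoning_content`, `tool_calls`), so one code path can serve both response shapes. A standalone sketch of that duck-typing idea (the dataclasses below are illustrative stand-ins, not litellm types):

from dataclasses import dataclass, field
from typing import Any


@dataclass
class Delta:
    """Shape of a streaming choice.delta (illustrative stand-in)."""
    content: str | None = None
    reasoning_content: str | None = None
    tool_calls: list[Any] = field(default_factory=list)


@dataclass
class Message:
    """Shape of a non-streaming choice.message (illustrative stand-in)."""
    content: str | None = None
    reasoning_content: str | None = None
    tool_calls: list[Any] = field(default_factory=list)


def extract_text(content_source: Delta | Message) -> list[str]:
    """Read both shapes through the same attribute names, as the helper does."""
    parts: list[str] = []
    if getattr(content_source, "reasoning_content", None):
        parts.append(f"[reasoning] {content_source.reasoning_content}")
    if getattr(content_source, "content", None):
        parts.append(content_source.content)
    return parts


print(extract_text(Delta(content="hel")))  # streaming fragment
print(extract_text(Message(content="hello", reasoning_content="thought")))  # full message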