fix: support `include_usage` when streaming with OpenAIChatGenerator (#8968)

Amnah199 · web-flow · commit 13c3768d49a2 · 2025-03-05T18:30:26.000+01:00
* fix error in handling usage completion chunk
diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py
@@ -399,29 +399,39 @@ def _prepare_api_call(  # noqa: PLR0913
     def _handle_stream_response(self, chat_completion: Stream, callback: SyncStreamingCallbackT) -> List[ChatMessage]:
         chunks: List[StreamingChunk] = []
         chunk = None
+        chunk_delta: StreamingChunk
 
         for chunk in chat_completion:  # pylint: disable=not-an-iterable
-            assert len(chunk.choices) == 1, "Streaming responses should have only one choice."
-            chunk_delta: StreamingChunk = self._convert_chat_completion_chunk_to_streaming_chunk(chunk)
+            # choices is an empty array for usage_chunk when include_usage is set to True
+            if chunk.usage is not None:
+                chunk_delta = self._convert_usage_chunk_to_streaming_chunk(chunk)
+
+            else:
+                assert len(chunk.choices) == 1, "Streaming responses should have only one choice."
+                chunk_delta = self._convert_chat_completion_chunk_to_streaming_chunk(chunk)
             chunks.append(chunk_delta)
 
             callback(chunk_delta)
-
         return [self._convert_streaming_chunks_to_chat_message(chunk, chunks)]
 
     async def _handle_async_stream_response(
         self, chat_completion: AsyncStream, callback: AsyncStreamingCallbackT
     ) -> List[ChatMessage]:
         chunks: List[StreamingChunk] = []
         chunk = None
+        chunk_delta: StreamingChunk
 
         async for chunk in chat_completion:  # pylint: disable=not-an-iterable
-            assert len(chunk.choices) == 1, "Streaming responses should have only one choice."
-            chunk_delta: StreamingChunk = self._convert_chat_completion_chunk_to_streaming_chunk(chunk)
+            # choices is an empty array for usage_chunk when include_usage is set to True
+            if chunk.usage is not None:
+                chunk_delta = self._convert_usage_chunk_to_streaming_chunk(chunk)
+
+            else:
+                assert len(chunk.choices) == 1, "Streaming responses should have only one choice."
+                chunk_delta = self._convert_chat_completion_chunk_to_streaming_chunk(chunk)
             chunks.append(chunk_delta)
 
             await callback(chunk_delta)
-
         return [self._convert_streaming_chunks_to_chat_message(chunk, chunks)]
 
     def _check_finish_reason(self, meta: Dict[str, Any]) -> None:
@@ -447,6 +457,8 @@ def _convert_streaming_chunks_to_chat_message(
 
         :param chunk: The last chunk returned by the OpenAI API.
         :param chunks: The list of all `StreamingChunk` objects.
+
+        :returns: The ChatMessage.
         """
         text = "".join([chunk.content for chunk in chunks])
         tool_calls = []
@@ -486,12 +498,15 @@ def _convert_streaming_chunks_to_chat_message(
                     _arguments=call_data["arguments"],
                 )
 
+        # finish_reason is in the last chunk, or in the second-to-last chunk when the final chunk carries usage
+        finish_reason = (chunks[-2] if chunk.usage and len(chunks) >= 2 else chunks[-1]).meta.get("finish_reason")
+
         meta = {
             "model": chunk.model,
             "index": 0,
-            "finish_reason": chunk.choices[0].finish_reason,
+            "finish_reason": finish_reason,
             "completion_start_time": chunks[0].meta.get("received_at"),  # first chunk received
-            "usage": {},  # we don't have usage data for streaming responses
+            "usage": chunk.usage or {},
         }
 
         return ChatMessage.from_assistant(text=text or None, tool_calls=tool_calls, meta=meta)
@@ -559,3 +574,18 @@ def _convert_chat_completion_chunk_to_streaming_chunk(self, chunk: ChatCompletio
             }
         )
         return chunk_message
+
+    def _convert_usage_chunk_to_streaming_chunk(self, chunk: ChatCompletionChunk) -> StreamingChunk:
+        """
+        Converts the usage chunk received from the OpenAI API when `include_usage` is set to `True` to a StreamingChunk.
+
+        :param chunk: The usage chunk returned by the OpenAI API.
+
+        :returns:
+            The StreamingChunk.
+        """
+        chunk_message = StreamingChunk(content="")
+        chunk_message.meta.update(
+            {"model": chunk.model, "usage": chunk.usage, "received_at": datetime.now().isoformat()}
+        )
+        return chunk_message
diff --git a/test/components/generators/chat/test_openai.py b/test/components/generators/chat/test_openai.py
@@ -9,7 +9,7 @@
 import os
 from datetime import datetime
 
-from openai import AsyncOpenAI, OpenAIError
+from openai import OpenAIError
 from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage, ChatCompletionMessageToolCall
 from openai.types.chat.chat_completion import Choice
 from openai.types.chat.chat_completion_message_tool_call import Function
@@ -63,7 +63,6 @@ def mock_chat_completion_chunk_with_tools(openai_mock_stream):
                 )
             ],
             created=int(datetime.now().timestamp()),
-            usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},
         )
         mock_chat_completion_create.return_value = openai_mock_stream(
             completion, cast_to=None, response=None, client=None
diff --git a/test/components/generators/chat/test_openai_async.py b/test/components/generators/chat/test_openai_async.py
@@ -62,7 +62,7 @@ def mock_chat_completion_chunk_with_tools(openai_mock_stream_async):
                 )
             ],
             created=int(datetime.now().timestamp()),
-            usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},
+            usage=None,
         )
         mock_chat_completion_create.return_value = openai_mock_stream_async(completion)
         yield mock_chat_completion_create
diff --git a/test/components/generators/conftest.py b/test/components/generators/conftest.py
@@ -7,9 +7,8 @@
 
 import pytest
 from openai import AsyncStream, Stream
-from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage
-from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta
-from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage, ChatCompletionMessageToolCall
+from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat import chat_completion_chunk
 
 
@@ -146,7 +145,7 @@ def openai_mock_chat_completion_chunk():
                 )
             ],
             created=int(datetime.now().timestamp()),
-            usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},
+            usage=None,
         )
         mock_chat_completion_create.return_value = OpenAIMockStream(
             completion, cast_to=None, response=None, client=None
@@ -175,7 +174,7 @@ async def openai_mock_async_chat_completion_chunk():
                 )
             ],
             created=int(datetime.now().timestamp()),
-            usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},
+            usage=None,
         )
         mock_chat_completion_create.return_value = OpenAIAsyncMockStream(completion)
         yield mock_chat_completion_create

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ def mock_chat_completion_chunk_with_tools(openai_mock_stream_async):`
`62`	`62`	`)`
`63`	`63`	`],`
`64`	`64`	`created=int(datetime.now().timestamp()),`
`65`		`- usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},`
	`65`	`+ usage=None,`
`66`	`66`	`)`
`67`	`67`	`mock_chat_completion_create.return_value = openai_mock_stream_async(completion)`
`68`	`68`	`yield mock_chat_completion_create`