Replies: 1 comment
-
Hey @No41Name! I'm here to help you with any bugs, questions, or contributions you might have. Let's tackle this together!

The difference in results between `chat()` and `stream_chat()` comes down to how each method generates the response and records it in memory. `chat()` waits for the complete LLM response and writes it to memory before returning, while `stream_chat()` returns immediately and writes the streamed response to chat history on a background thread. This difference in modes can lead to variations in the results, especially if the response generation involves multiple steps or function calls.
Supporting code snippets:

```python
@trace_method("chat")
def chat(
    self,
    message: str,
    chat_history: Optional[List[ChatMessage]] = None,
    tool_choice: Union[str, dict] = "auto",
) -> AgentChatResponse:
    if chat_history is not None:
        self._memory.set(chat_history)
    self._memory.put(ChatMessage(content=message, role="user"))
    initial_token_count = len(
        self._memory.tokenizer_fn(
            " ".join([(m.content or "") for m in self._prefix_messages])
        )
    )
    all_messages = self._prefix_messages + self._memory.get(
        initial_token_count=initial_token_count
    )
    chat_response = self._llm.chat(all_messages)
    ai_message = chat_response.message
    self._memory.put(ai_message)
    return AgentChatResponse(response=str(chat_response.message.content))
```
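Note that `chat()` itself puts the complete `ai_message` into memory before returning, whereas `stream_chat()` below delegates that work to `write_response_to_history` running on a separate thread.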
```python
@trace_method("chat")
def stream_chat(
    self,
    message: str,
    chat_history: Optional[List[ChatMessage]] = None,
    tool_choice: Union[str, dict] = "auto",
) -> StreamingAgentChatResponse:
    if chat_history is not None:
        self._memory.set(chat_history)
    self._memory.put(ChatMessage(content=message, role="user"))
    initial_token_count = len(
        self._memory.tokenizer_fn(
            " ".join([(m.content or "") for m in self._prefix_messages])
        )
    )
    all_messages = self._prefix_messages + self._memory.get(
        initial_token_count=initial_token_count
    )
    chat_response = StreamingAgentChatResponse(
        chat_stream=self._llm.stream_chat(all_messages)
    )
    thread = Thread(
        target=chat_response.write_response_to_history, args=(self._memory,)
    )
    thread.start()
    return chat_response
```

Additionally, the asynchronous variants `achat()` and `astream_chat()` follow the same pattern:
```python
@trace_method("chat")
async def achat(
    self, message: str, chat_history: Optional[List[ChatMessage]] = None
) -> AgentChatResponse:
    if chat_history is not None:
        self._memory.set(chat_history)
    self._memory.put(ChatMessage(content=message, role="user"))
    initial_token_count = len(
        self._memory.tokenizer_fn(
            " ".join([(m.content or "") for m in self._prefix_messages])
        )
    )
    all_messages = self._prefix_messages + self._memory.get(
        initial_token_count=initial_token_count
    )
    chat_response = await self._llm.achat(all_messages)
    ai_message = chat_response.message
    self._memory.put(ai_message)
    return AgentChatResponse(response=str(chat_response.message.content))
```
```python
@trace_method("chat")
async def astream_chat(
    self, message: str, chat_history: Optional[List[ChatMessage]] = None
) -> StreamingAgentChatResponse:
    if chat_history is not None:
        self._memory.set(chat_history)
    self._memory.put(ChatMessage(content=message, role="user"))
    initial_token_count = len(
        self._memory.tokenizer_fn(
            " ".join([(m.content or "") for m in self._prefix_messages])
        )
    )
    all_messages = self._prefix_messages + self._memory.get(
        initial_token_count=initial_token_count
    )
    chat_response = StreamingAgentChatResponse(
        achat_stream=await self._llm.astream_chat(all_messages)
    )
    asyncio.create_task(chat_response.awrite_response_to_history(self._memory))
    return chat_response
```

In summary, the different modes (`chat()`, `stream_chat()`, `achat()`, `astream_chat()`) differ in when the assistant message is written to memory: the non-streaming methods store the complete message before returning, while the streaming methods return immediately and write the response to history in the background as it is consumed. These different execution paths can explain why you see different answers even with the same configuration.
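If you want to verify this yourself, here is a minimal sketch (assuming a recent llama-index version; the `./data` path and the question are placeholders) that compares the text and the recorded history for the two paths:

```python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Build a chat engine; directory path and default chat mode are placeholders.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
chat_engine = index.as_chat_engine()

question = "What is Alice in Wonderland about?"

# Non-streaming path: the full message is generated and stored before returning.
chat_engine.reset()
response = chat_engine.chat(question)
sync_history = list(chat_engine.chat_history)

# Streaming path: consume the generator fully so the background thread has the
# complete response to write into history (the write may finish slightly after
# the generator is exhausted).
chat_engine.reset()
streaming_response = chat_engine.stream_chat(question)
streamed_text = "".join(streaming_response.response_gen)
stream_history = list(chat_engine.chat_history)

# Compare the final outputs produced by the two paths.
print(str(response) == streamed_text)
print(sync_history[-1].content == stream_history[-1].content)
```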
-
Hi,
While implementing a RAG with chat_engine I noticed that the `chat()` and `stream_chat()` methods return different results. I thought that the second one just provided the same response as a generator, to allow for live printing.
Are there additional differences?
Here is an example of what I'm talking about: a simple RAG implementation with two documents (Alice in Wonderland and Shakespeare). You can see that the answers are really different, with the same configuration and the LLM temperature set to zero.
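For reference, a minimal sketch of the kind of setup described (the file paths, model, and question are placeholders, not the original code):

```python
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI

# Temperature set to zero, as in the reported configuration (model is a placeholder).
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Two documents: Alice in Wonderland and Shakespeare (placeholder paths).
documents = SimpleDirectoryReader(
    input_files=["alice_in_wonderland.txt", "shakespeare.txt"]
).load_data()
index = VectorStoreIndex.from_documents(documents)
chat_engine = index.as_chat_engine()

question = "Compare the writing styles of the two books."

# Same question through both methods, starting from a fresh history each time.
print(chat_engine.chat(question))

chat_engine.reset()
streaming_response = chat_engine.stream_chat(question)
streaming_response.print_response_stream()  # prints tokens as they arrive
```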