
Commit 9abd2a3

fix: improve sequential tool calling for non-streaming responses
- Fixed Ollama handling to properly separate streaming/non-streaming modes
- Added robust error handling for response extraction
- Fixed test file to use correct stream parameter
- Ensures tool outputs are processed by LLM in non-streaming mode

Addresses review feedback on PR #832

Co-authored-by: Mervin Praison <[email protected]>
1 parent 48d0f8d commit 9abd2a3
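
The heart of the change is the branch on the caller's stream flag: in streaming mode the response text is accumulated from each chunk's delta, while in non-streaming mode a single completion object comes back and its message content is read out directly. A minimal standalone sketch of that pattern, assuming litellm is installed and an Ollama model is reachable (the model name and the fetch_text helper are illustrative placeholders, not code from this commit):

    import litellm

    def fetch_text(messages, model="ollama/llama3", temperature=0.2, stream=True):
        # Illustrative helper: same streaming / non-streaming split as the fix.
        if stream:
            # Streaming: concatenate the delta content of each chunk as it arrives.
            text = ""
            for chunk in litellm.completion(model=model, messages=messages,
                                            temperature=temperature, stream=True):
                if chunk and chunk.choices and chunk.choices[0].delta.content:
                    text += chunk.choices[0].delta.content
            return text
        # Non-streaming: one response object; extract the content defensively,
        # mirroring the pattern added in this commit.
        resp = litellm.completion(model=model, messages=messages,
                                  temperature=temperature, stream=False)
        return resp.get("choices", [{}])[0].get("message", {}).get("content", "") or ""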

File tree: 2 files changed (+27, -15 lines)


src/praisonai-agents/praisonaiagents/llm/llm.py

Lines changed: 26 additions & 14 deletions
@@ -864,32 +864,44 @@ def get_response(
             ollama_params = self._handle_ollama_model(response_text, tool_results, messages, original_prompt)

             if ollama_params:
-                # Get response with streaming
-                if verbose:
-                    with Live(display_generating("", start_time), console=console, refresh_per_second=4) as live:
+                # Get response based on streaming mode
+                if stream:
+                    # Streaming approach
+                    if verbose:
+                        with Live(display_generating("", start_time), console=console, refresh_per_second=4) as live:
+                            response_text = ""
+                            for chunk in litellm.completion(
+                                **self._build_completion_params(
+                                    messages=ollama_params["follow_up_messages"],
+                                    temperature=temperature,
+                                    stream=True
+                                )
+                            ):
+                                if chunk and chunk.choices and chunk.choices[0].delta.content:
+                                    content = chunk.choices[0].delta.content
+                                    response_text += content
+                                    live.update(display_generating(response_text, start_time))
+                    else:
                         response_text = ""
                         for chunk in litellm.completion(
                             **self._build_completion_params(
                                 messages=ollama_params["follow_up_messages"],
                                 temperature=temperature,
-                                stream=stream
+                                stream=True
                             )
                         ):
                             if chunk and chunk.choices and chunk.choices[0].delta.content:
-                                content = chunk.choices[0].delta.content
-                                response_text += content
-                                live.update(display_generating(response_text, start_time))
+                                response_text += chunk.choices[0].delta.content
                 else:
-                    response_text = ""
-                    for chunk in litellm.completion(
+                    # Non-streaming approach
+                    resp = litellm.completion(
                         **self._build_completion_params(
                             messages=ollama_params["follow_up_messages"],
                             temperature=temperature,
-                            stream=stream
+                            stream=False
                         )
                     )
-                    ):
-                        if chunk and chunk.choices and chunk.choices[0].delta.content:
-                            response_text += chunk.choices[0].delta.content
+                    )
+                    response_text = resp.get("choices", [{}])[0].get("message", {}).get("content", "") or ""

                 # Set flag to indicate Ollama was handled
                 ollama_handled = True
@@ -988,7 +1000,7 @@ def get_response(
                         **kwargs
                     )
                 )
-                final_response_text = resp["choices"][0]["message"]["content"]
+                final_response_text = resp.get("choices", [{}])[0].get("message", {}).get("content", "") or ""

                 final_response_text = final_response_text.strip()

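The chained .get() calls with the trailing or "" fallback are what make the non-streaming extraction tolerant of a missing choices/message key or a None content field, where plain indexing would raise. A quick illustration with plain dicts standing in for response objects (the inputs below are made-up examples, not real litellm output):

    def extract_content(resp):
        # Same defensive pattern as the commit: a default at every step, "" on None.
        return resp.get("choices", [{}])[0].get("message", {}).get("content", "") or ""

    print(extract_content({"choices": [{"message": {"content": "The price is 200"}}]}))  # -> "The price is 200"
    print(extract_content({"choices": [{"message": {"content": None}}]}))                # -> ""
    print(extract_content({}))                                                           # -> "" instead of a KeyError
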
test_sequential_tool_calling.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def multiply(a: int, b: int) -> int:
     self_reflect=False,
     verbose=True,
     tools=[get_stock_price, multiply],
-    llm_config={"stream": False} # Force non-streaming mode
+    stream=False # Force non-streaming mode - use stream parameter directly
 )

 result = agent.chat("Get the stock price of Google and multiply it by 2")
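
For context, the test around this hunk builds an Agent with two tools and now forces non-streaming mode through the stream parameter itself rather than llm_config. A condensed sketch of that setup, assuming the praisonaiagents Agent constructor accepts the parameters shown in the diff above; the tool bodies and the instructions text are stand-ins, not copied from the test file:

    from praisonaiagents import Agent

    def get_stock_price(symbol: str) -> int:
        # Stand-in tool body; the real implementation is not part of this diff.
        return 100

    def multiply(a: int, b: int) -> int:
        return a * b

    agent = Agent(
        instructions="Use the available tools to answer the question.",  # stand-in wording
        self_reflect=False,
        verbose=True,
        tools=[get_stock_price, multiply],
        stream=False  # force non-streaming so tool outputs are fed back through the LLM
    )

    result = agent.chat("Get the stock price of Google and multiply it by 2")
    print(result)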
