Commit 2ad77d9

feat(ollama/completion): output parse thinking content on streaming + non-streaming for ollama completion calls
Completes 'thinking' param support for ollama
1 parent e6429f6 · commit 2ad77d9

4 files changed: +353 -44 lines changed

4 files changed

+353
-44
lines changed

litellm/llms/ollama/chat/transformation.py

Lines changed: 9 additions & 20 deletions
@@ -504,34 +504,23 @@ def chunk_parser(self, chunk: dict) -> ModelResponseStream:
                 reasoning_content = chunk["message"].get("thinking")
                 self.finished_reasoning_content = True
             elif chunk["message"].get("content") is not None:
-                if "<think>" in chunk["message"].get("content"):
-                    reasoning_content = (
-                        chunk["message"].get("content").replace("<think>", "")
-                    )
+                message_content = chunk["message"].get("content")
+                if "<think>" in message_content:
+                    message_content = message_content.replace("<think>", "")

                     self.started_reasoning_content = True

-                if (
-                    "</think>" in chunk["message"].get("content")
-                    and self.started_reasoning_content
-                ):
-                    reasoning_content = chunk["message"].get("content")
-                    remaining_content = (
-                        chunk["message"].get("content").split("</think>")
-                    )
-                    if len(remaining_content) > 1:
-                        content = remaining_content[1]
+                if "</think>" in message_content and self.started_reasoning_content:
+                    message_content = message_content.replace("</think>", "")
                     self.finished_reasoning_content = True

                 if (
-                    self.started_reasoning_content is True
-                    and self.finished_reasoning_content is False
+                    self.started_reasoning_content
+                    and not self.finished_reasoning_content
                 ):
-                    reasoning_content = (
-                        chunk["message"].get("content").replace("<think>", "")
-                    )
+                    reasoning_content = message_content
                 else:
-                    content = chunk["message"].get("content")
+                    content = message_content

             delta = Delta(
                 content=content,
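
The chat-path change above reduces the `<think>` handling to a two-flag state machine: `started_reasoning_content` flips on when a `<think>` tag is seen, `finished_reasoning_content` flips on at `</think>`, and each chunk's text is routed to `reasoning_content` while only the first flag is set, otherwise to `content`. Below is a minimal standalone sketch of that routing; the class name and sample chunks are illustrative, not litellm API.

from typing import Optional, Tuple

class ThinkTagSplitter:
    """Illustrative two-flag splitter; the real logic lives in the chat chunk_parser above."""

    def __init__(self) -> None:
        self.started_reasoning_content = False
        self.finished_reasoning_content = False

    def split(self, message_content: str) -> Tuple[Optional[str], Optional[str]]:
        reasoning_content: Optional[str] = None
        content: Optional[str] = None
        if "<think>" in message_content:
            message_content = message_content.replace("<think>", "")
            self.started_reasoning_content = True
        if "</think>" in message_content and self.started_reasoning_content:
            message_content = message_content.replace("</think>", "")
            self.finished_reasoning_content = True
        if self.started_reasoning_content and not self.finished_reasoning_content:
            reasoning_content = message_content
        else:
            content = message_content
        return reasoning_content, content

splitter = ThinkTagSplitter()
for piece in ["<think>Let me work it out", " step by step.</think>", "The answer is 4."]:
    print(splitter.split(piece))
# ('Let me work it out', None)
# (None, ' step by step.')
# (None, 'The answer is 4.')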

litellm/llms/ollama/completion/transformation.py

Lines changed: 83 additions & 19 deletions
@@ -19,13 +19,13 @@
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.openai import AllMessageValues, ChatCompletionUsageBlock
 from litellm.types.utils import (
+    Delta,
     GenericStreamingChunk,
     ModelInfoBase,
     ModelResponse,
     ModelResponseStream,
     ProviderField,
     StreamingChoices,
-    Delta,
 )

 from ..common_utils import OllamaError, _convert_image

@@ -92,9 +92,9 @@ class OllamaConfig(BaseConfig):
     repeat_penalty: Optional[float] = None
     temperature: Optional[float] = None
     seed: Optional[int] = None
-    stop: Optional[
-        list
-    ] = None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    stop: Optional[list] = (
+        None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    )
     tfs_z: Optional[float] = None
     num_predict: Optional[int] = None
     top_k: Optional[int] = None

@@ -154,6 +154,7 @@ def get_supported_openai_params(self, model: str):
             "stop",
             "response_format",
             "max_completion_tokens",
+            "reasoning_effort",
         ]

     def map_openai_params(

@@ -166,19 +167,21 @@ def map_openai_params(
         for param, value in non_default_params.items():
             if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["num_predict"] = value
-            if param == "stream":
+            elif param == "stream":
                 optional_params["stream"] = value
-            if param == "temperature":
+            elif param == "temperature":
                 optional_params["temperature"] = value
-            if param == "seed":
+            elif param == "seed":
                 optional_params["seed"] = value
-            if param == "top_p":
+            elif param == "top_p":
                 optional_params["top_p"] = value
-            if param == "frequency_penalty":
+            elif param == "frequency_penalty":
                 optional_params["frequency_penalty"] = value
-            if param == "stop":
+            elif param == "stop":
                 optional_params["stop"] = value
-            if param == "response_format" and isinstance(value, dict):
+            elif param == "reasoning_effort" and value is not None:
+                optional_params["think"] = True
+            elif param == "response_format" and isinstance(value, dict):
                 if value["type"] == "json_object":
                     optional_params["format"] = "json"
                 elif value["type"] == "json_schema":
@@ -258,12 +261,17 @@ def transform_response(
         api_key: Optional[str] = None,
         json_mode: Optional[bool] = None,
     ) -> ModelResponse:
+        from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+            _parse_content_for_reasoning,
+        )
+
         response_json = raw_response.json()
         ## RESPONSE OBJECT
         model_response.choices[0].finish_reason = "stop"
         if request_data.get("format", "") == "json":
             # Check if response field exists and is not empty before parsing JSON
             response_text = response_json.get("response", "")
+
             if not response_text or not response_text.strip():
                 # Handle empty response gracefully - set empty content
                 message = litellm.Message(content="")

@@ -288,7 +296,9 @@ def transform_response(
                             "id": f"call_{str(uuid.uuid4())}",
                             "function": {
                                 "name": function_call["name"],
-                                "arguments": json.dumps(function_call["arguments"]),
+                                "arguments": json.dumps(
+                                    function_call["arguments"]
+                                ),
                             },
                             "type": "function",
                         }

@@ -305,11 +315,26 @@ def transform_response(
                     model_response.choices[0].finish_reason = "stop"
                 except json.JSONDecodeError:
                     # If JSON parsing fails, treat as regular text response
-                    message = litellm.Message(content=response_text)
+                    ## output parse reasoning content from response_text
+                    reasoning_content: Optional[str] = None
+                    content: Optional[str] = None
+                    if response_text is not None:
+                        reasoning_content, content = _parse_content_for_reasoning(
+                            response_text
+                        )
+                    message = litellm.Message(
+                        content=content, reasoning_content=reasoning_content
+                    )
                     model_response.choices[0].message = message  # type: ignore
                     model_response.choices[0].finish_reason = "stop"
         else:
-            model_response.choices[0].message.content = response_json["response"]  # type: ignore
+            response_text = response_json.get("response", "")
+            content: Optional[str] = None
+            reasoning_content: Optional[str] = None
+            if response_text is not None:
+                reasoning_content, content = _parse_content_for_reasoning(response_text)
+            model_response.choices[0].message.content = content  # type: ignore
+            model_response.choices[0].message.reasoning_content = reasoning_content  # type: ignore
         model_response.created = int(time.time())
         model_response.model = "ollama/" + model
         _prompt = request_data.get("prompt", "")
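
Both non-streaming branches above now run the raw `response` text through `_parse_content_for_reasoning` and populate `message.reasoning_content` alongside `message.content`. A rough sketch of what that helper is assumed to do, namely split a leading `<think>...</think>` block away from the visible answer (the regex below is an approximation for illustration, not the helper's actual implementation):

import re
from typing import Optional, Tuple

def parse_content_for_reasoning_sketch(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Assumed behaviour of _parse_content_for_reasoning: returns (reasoning, content)."""
    match = re.match(r"\s*<think>(.*?)</think>\s*(.*)", text, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2)
    return None, text

reasoning, content = parse_content_for_reasoning_sketch(
    "<think>2 + 2 is basic arithmetic.</think>The answer is 4."
)
print(reasoning)  # "2 + 2 is basic arithmetic."
print(content)    # "The answer is 4."
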
@@ -434,12 +459,21 @@ def get_model_response_iterator(


 class OllamaTextCompletionResponseIterator(BaseModelResponseIterator):
+    def __init__(
+        self, streaming_response, sync_stream: bool, json_mode: Optional[bool] = False
+    ):
+        super().__init__(streaming_response, sync_stream, json_mode)
+        self.started_reasoning_content: bool = False
+        self.finished_reasoning_content: bool = False
+
     def _handle_string_chunk(
         self, str_line: str
     ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         return self.chunk_parser(json.loads(str_line))

-    def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelResponseStream]:
+    def chunk_parser(
+        self, chunk: dict
+    ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         try:
             if "error" in chunk:
                 raise Exception(f"Ollama Error - {chunk}")

@@ -469,12 +503,42 @@ def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelRespons
                 )
             elif chunk["response"]:
                 text = chunk["response"]
-                return GenericStreamingChunk(
-                    text=text,
-                    is_finished=is_finished,
-                    finish_reason="stop",
+                reasoning_content: Optional[str] = None
+                content: Optional[str] = None
+                if text is not None:
+                    if "<think>" in text:
+                        text = text.replace("<think>", "")
+                        self.started_reasoning_content = True
+                    elif "</think>" in text:
+                        text = text.replace("</think>", "")
+                        self.finished_reasoning_content = True
+
+                    if (
+                        self.started_reasoning_content
+                        and not self.finished_reasoning_content
+                    ):
+                        reasoning_content = text
+                    else:
+                        content = text
+
+                return ModelResponseStream(
+                    choices=[
+                        StreamingChoices(
+                            index=0,
+                            delta=Delta(
+                                reasoning_content=reasoning_content, content=content
+                            ),
+                        )
+                    ],
+                    finish_reason=finish_reason,
                     usage=None,
                 )
+                # return GenericStreamingChunk(
+                #     text=text,
+                #     is_finished=is_finished,
+                #     finish_reason="stop",
+                #     usage=None,
+                # )
             elif "thinking" in chunk and not chunk["response"]:
                 # Return reasoning content as ModelResponseStream so UIs can render it
                 thinking_content = chunk.get("thinking") or ""
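
Taken together, the iterator changes above make the `ollama/` text-completion route emit `reasoning_content` deltas during streaming, mirroring the chat route. A hedged end-to-end sketch of consuming that stream (assumes a local Ollama server with the model already pulled; the model name and prompt are illustrative):

import litellm

response = litellm.completion(
    model="ollama/deepseek-r1:1.5b",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    reasoning_effort="low",  # mapped to Ollama's `think` flag by this commit
    stream=True,
)

reasoning_chunks, content_chunks = [], []
for chunk in response:
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):
        reasoning_chunks.append(delta.reasoning_content)
    if delta.content:
        content_chunks.append(delta.content)

print("reasoning:", "".join(reasoning_chunks))
print("answer   :", "".join(content_chunks))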

litellm/proxy/_new_secret_config.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ model_list:
       mode: chat
   - model_name: ollama-deepseek-r1
     litellm_params:
-      model: ollama_chat/deepseek-r1:1.5b
+      model: ollama/deepseek-r1:1.5b
     model_info:
       mode: chat
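
The config change switches the proxy's `ollama-deepseek-r1` alias from the `ollama_chat/` (chat API) provider to `ollama/` (generate API), so requests exercise the text-completion transformation changed in this commit. A hedged non-streaming sketch of the resulting behaviour when calling litellm directly (model name is illustrative; `reasoning_content` is populated when the model wraps its reasoning in `<think>` tags):

import litellm

resp = litellm.completion(
    model="ollama/deepseek-r1:1.5b",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    reasoning_effort="low",  # mapped to Ollama's `think` flag
)
msg = resp.choices[0].message
print(getattr(msg, "reasoning_content", None))  # parsed out of the <think>...</think> block
print(msg.content)                              # the visible answer text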
