Commit 2ad77d9

feat(ollama/completion): output parse thinking content on streaming + non-streaming for ollama completion calls
Completes 'thinking' param support for ollama
1 parent e6429f6 · commit 2ad77d9

4 files changed: +353 -44 lines changed

4 files changed

+353
-44
lines changed

litellm/llms/ollama/chat/transformation.py

Lines changed: 9 additions & 20 deletions
@@ -504,34 +504,23 @@ def chunk_parser(self, chunk: dict) -> ModelResponseStream:
                 reasoning_content = chunk["message"].get("thinking")
                 self.finished_reasoning_content = True
             elif chunk["message"].get("content") is not None:
-                if "<think>" in chunk["message"].get("content"):
-                    reasoning_content = (
-                        chunk["message"].get("content").replace("<think>", "")
-                    )
+                message_content = chunk["message"].get("content")
+                if "<think>" in message_content:
+                    message_content = message_content.replace("<think>", "")

                     self.started_reasoning_content = True

-                if (
-                    "</think>" in chunk["message"].get("content")
-                    and self.started_reasoning_content
-                ):
-                    reasoning_content = chunk["message"].get("content")
-                    remaining_content = (
-                        chunk["message"].get("content").split("</think>")
-                    )
-                    if len(remaining_content) > 1:
-                        content = remaining_content[1]
+                if "</think>" in message_content and self.started_reasoning_content:
+                    message_content = message_content.replace("</think>", "")
                     self.finished_reasoning_content = True

                 if (
-                    self.started_reasoning_content is True
-                    and self.finished_reasoning_content is False
+                    self.started_reasoning_content
+                    and not self.finished_reasoning_content
                 ):
-                    reasoning_content = (
-                        chunk["message"].get("content").replace("<think>", "")
-                    )
+                    reasoning_content = message_content
                 else:
-                    content = chunk["message"].get("content")
+                    content = message_content

             delta = Delta(
                 content=content,
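
The chat-path change above reduces the `<think>` handling to a two-flag state machine: `started_reasoning_content` flips on when a `<think>` tag is seen, `finished_reasoning_content` flips on at `</think>`, and each chunk's text is routed to `reasoning_content` while only the first flag is set, otherwise to `content`. Below is a minimal standalone sketch of that routing; the class name and sample chunks are illustrative, not litellm API.

from typing import Optional, Tuple

class ThinkTagSplitter:
    """Illustrative two-flag splitter; the real logic lives in the chat chunk_parser above."""

    def __init__(self) -> None:
        self.started_reasoning_content = False
        self.finished_reasoning_content = False

    def split(self, message_content: str) -> Tuple[Optional[str], Optional[str]]:
        reasoning_content: Optional[str] = None
        content: Optional[str] = None
        if "<think>" in message_content:
            message_content = message_content.replace("<think>", "")
            self.started_reasoning_content = True
        if "</think>" in message_content and self.started_reasoning_content:
            message_content = message_content.replace("</think>", "")
            self.finished_reasoning_content = True
        if self.started_reasoning_content and not self.finished_reasoning_content:
            reasoning_content = message_content
        else:
            content = message_content
        return reasoning_content, content

splitter = ThinkTagSplitter()
for piece in ["<think>Let me work it out", " step by step.</think>", "The answer is 4."]:
    print(splitter.split(piece))
# ('Let me work it out', None)
# (None, ' step by step.')
# (None, 'The answer is 4.')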

litellm/llms/ollama/completion/transformation.py

Lines changed: 83 additions & 19 deletions
@@ -19,13 +19,13 @@
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.openai import AllMessageValues, ChatCompletionUsageBlock
 from litellm.types.utils import (
+    Delta,
     GenericStreamingChunk,
     ModelInfoBase,
     ModelResponse,
     ModelResponseStream,
     ProviderField,
     StreamingChoices,
-    Delta,
 )

 from ..common_utils import OllamaError, _convert_image

@@ -92,9 +92,9 @@ class OllamaConfig(BaseConfig):
     repeat_penalty: Optional[float] = None
     temperature: Optional[float] = None
     seed: Optional[int] = None
-    stop: Optional[
-        list
-    ] = None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    stop: Optional[list] = (
+        None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    )
     tfs_z: Optional[float] = None
     num_predict: Optional[int] = None
     top_k: Optional[int] = None

@@ -154,6 +154,7 @@ def get_supported_openai_params(self, model: str):
             "stop",
             "response_format",
             "max_completion_tokens",
+            "reasoning_effort",
         ]

     def map_openai_params(

@@ -166,19 +167,21 @@ def map_openai_params(
         for param, value in non_default_params.items():
             if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["num_predict"] = value
-            if param == "stream":
+            elif param == "stream":
                 optional_params["stream"] = value
-            if param == "temperature":
+            elif param == "temperature":
                 optional_params["temperature"] = value
-            if param == "seed":
+            elif param == "seed":
                 optional_params["seed"] = value
-            if param == "top_p":
+            elif param == "top_p":
                 optional_params["top_p"] = value
-            if param == "frequency_penalty":
+            elif param == "frequency_penalty":
                 optional_params["frequency_penalty"] = value
-            if param == "stop":
+            elif param == "stop":
                 optional_params["stop"] = value
-            if param == "response_format" and isinstance(value, dict):
+            elif param == "reasoning_effort" and value is not None:
+                optional_params["think"] = True
+            elif param == "response_format" and isinstance(value, dict):
                 if value["type"] == "json_object":
                     optional_params["format"] = "json"
                 elif value["type"] == "json_schema":
@@ -258,12 +261,17 @@ def transform_response(
         api_key: Optional[str] = None,
         json_mode: Optional[bool] = None,
     ) -> ModelResponse:
+        from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+            _parse_content_for_reasoning,
+        )
+
         response_json = raw_response.json()
         ## RESPONSE OBJECT
         model_response.choices[0].finish_reason = "stop"
         if request_data.get("format", "") == "json":
             # Check if response field exists and is not empty before parsing JSON
             response_text = response_json.get("response", "")
+
             if not response_text or not response_text.strip():
                 # Handle empty response gracefully - set empty content
                 message = litellm.Message(content="")

@@ -288,7 +296,9 @@ def transform_response(
                             "id": f"call_{str(uuid.uuid4())}",
                             "function": {
                                 "name": function_call["name"],
-                                "arguments": json.dumps(function_call["arguments"]),
+                                "arguments": json.dumps(
+                                    function_call["arguments"]
+                                ),
                             },
                             "type": "function",
                         }

@@ -305,11 +315,26 @@ def transform_response(
                     model_response.choices[0].finish_reason = "stop"
                 except json.JSONDecodeError:
                     # If JSON parsing fails, treat as regular text response
-                    message = litellm.Message(content=response_text)
+                    ## output parse reasoning content from response_text
+                    reasoning_content: Optional[str] = None
+                    content: Optional[str] = None
+                    if response_text is not None:
+                        reasoning_content, content = _parse_content_for_reasoning(
+                            response_text
+                        )
+                    message = litellm.Message(
+                        content=content, reasoning_content=reasoning_content
+                    )
                     model_response.choices[0].message = message  # type: ignore
                     model_response.choices[0].finish_reason = "stop"
         else:
-            model_response.choices[0].message.content = response_json["response"]  # type: ignore
+            response_text = response_json.get("response", "")
+            content: Optional[str] = None
+            reasoning_content: Optional[str] = None
+            if response_text is not None:
+                reasoning_content, content = _parse_content_for_reasoning(response_text)
+            model_response.choices[0].message.content = content  # type: ignore
+            model_response.choices[0].message.reasoning_content = reasoning_content  # type: ignore
         model_response.created = int(time.time())
         model_response.model = "ollama/" + model
         _prompt = request_data.get("prompt", "")
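
Both non-streaming branches above now run the raw `response` text through `_parse_content_for_reasoning` and populate `message.reasoning_content` alongside `message.content`. A rough sketch of what that helper is assumed to do, namely split a leading `<think>...</think>` block away from the visible answer (the regex below is an approximation for illustration, not the helper's actual implementation):

import re
from typing import Optional, Tuple

def parse_content_for_reasoning_sketch(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Assumed behaviour of _parse_content_for_reasoning: returns (reasoning, content)."""
    match = re.match(r"\s*<think>(.*?)</think>\s*(.*)", text, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2)
    return None, text

reasoning, content = parse_content_for_reasoning_sketch(
    "<think>2 + 2 is basic arithmetic.</think>The answer is 4."
)
print(reasoning)  # "2 + 2 is basic arithmetic."
print(content)    # "The answer is 4."
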
@@ -434,12 +459,21 @@ def get_model_response_iterator(


 class OllamaTextCompletionResponseIterator(BaseModelResponseIterator):
+    def __init__(
+        self, streaming_response, sync_stream: bool, json_mode: Optional[bool] = False
+    ):
+        super().__init__(streaming_response, sync_stream, json_mode)
+        self.started_reasoning_content: bool = False
+        self.finished_reasoning_content: bool = False
+
     def _handle_string_chunk(
         self, str_line: str
     ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         return self.chunk_parser(json.loads(str_line))

-    def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelResponseStream]:
+    def chunk_parser(
+        self, chunk: dict
+    ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         try:
             if "error" in chunk:
                 raise Exception(f"Ollama Error - {chunk}")

@@ -469,12 +503,42 @@ def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelRespons
                 )
             elif chunk["response"]:
                 text = chunk["response"]
-                return GenericStreamingChunk(
-                    text=text,
-                    is_finished=is_finished,
-                    finish_reason="stop",
+                reasoning_content: Optional[str] = None
+                content: Optional[str] = None
+                if text is not None:
+                    if "<think>" in text:
+                        text = text.replace("<think>", "")
+                        self.started_reasoning_content = True
+                    elif "</think>" in text:
+                        text = text.replace("</think>", "")
+                        self.finished_reasoning_content = True
+
+                    if (
+                        self.started_reasoning_content
+                        and not self.finished_reasoning_content
+                    ):
+                        reasoning_content = text
+                    else:
+                        content = text
+
+                return ModelResponseStream(
+                    choices=[
+                        StreamingChoices(
+                            index=0,
+                            delta=Delta(
+                                reasoning_content=reasoning_content, content=content
+                            ),
+                        )
+                    ],
+                    finish_reason=finish_reason,
                     usage=None,
                 )
+                # return GenericStreamingChunk(
+                #     text=text,
+                #     is_finished=is_finished,
+                #     finish_reason="stop",
+                #     usage=None,
+                # )
             elif "thinking" in chunk and not chunk["response"]:
                 # Return reasoning content as ModelResponseStream so UIs can render it
                 thinking_content = chunk.get("thinking") or ""
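
Taken together, the iterator changes above make the `ollama/` text-completion route emit `reasoning_content` deltas during streaming, mirroring the chat route. A hedged end-to-end sketch of consuming that stream (assumes a local Ollama server with the model already pulled; the model name and prompt are illustrative):

import litellm

response = litellm.completion(
    model="ollama/deepseek-r1:1.5b",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    reasoning_effort="low",  # mapped to Ollama's `think` flag by this commit
    stream=True,
)

reasoning_chunks, content_chunks = [], []
for chunk in response:
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):
        reasoning_chunks.append(delta.reasoning_content)
    if delta.content:
        content_chunks.append(delta.content)

print("reasoning:", "".join(reasoning_chunks))
print("answer   :", "".join(content_chunks))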

litellm/proxy/_new_secret_config.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ model_list:
       mode: chat
   - model_name: ollama-deepseek-r1
     litellm_params:
-      model: ollama_chat/deepseek-r1:1.5b
+      model: ollama/deepseek-r1:1.5b
     model_info:
       mode: chat
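
The config change switches the proxy's `ollama-deepseek-r1` alias from the `ollama_chat/` (chat API) provider to `ollama/` (generate API), so requests exercise the text-completion transformation changed in this commit. A hedged non-streaming sketch of the resulting behaviour when calling litellm directly (model name is illustrative; `reasoning_content` is populated when the model wraps its reasoning in `<think>` tags):

import litellm

resp = litellm.completion(
    model="ollama/deepseek-r1:1.5b",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    reasoning_effort="low",  # mapped to Ollama's `think` flag
)
msg = resp.choices[0].message
print(getattr(msg, "reasoning_content", None))  # parsed out of the <think>...</think> block
print(msg.content)                              # the visible answer text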
