Commit 359780b

Merge pull request #14121 from BerriAI/litellm_dev_08_31_2025_p1
VLLM - handle output parsing for Responses API output + Ollama - add unified 'thinking' param support (via `reasoning_content`)
2 parents 6d36219 + 82d6c4d commit 359780b
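For context, a minimal usage sketch of the Ollama side of this change, assuming a local Ollama server and a thinking-capable model (the model name `ollama_chat/deepseek-r1` and the prompt are placeholders, not part of this PR): passing `reasoning_effort` should switch on Ollama's `think` mode, and the model's thinking should come back on `reasoning_content`.

```python
# Hedged sketch of the new Ollama 'thinking' support; model name and prompt are
# placeholders, and the exact output depends on the local model.
import litellm

response = litellm.completion(
    model="ollama_chat/deepseek-r1",  # placeholder thinking-capable model
    messages=[{"role": "user", "content": "What is 17 * 23?"}],
    reasoning_effort="low",  # mapped to Ollama's `think` option by this PR
)

message = response.choices[0].message
print(message.reasoning_content)  # model's thinking, remapped from Ollama's 'thinking' field
print(message.content)            # final answer, with any <think> block stripped out
```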

File tree

7 files changed: +443, -61 lines changed

litellm/llms/ollama/chat/transformation.py

Lines changed: 64 additions & 6 deletions
@@ -137,6 +137,7 @@ def get_supported_openai_params(self, model: str):
             "tool_choice",
             "functions",
             "response_format",
+            "reasoning_effort",
         ]
 
     def map_openai_params(
@@ -175,6 +176,8 @@ def map_openai_params(
                 if value.get("json_schema") and value["json_schema"].get("schema"):
                     optional_params["format"] = value["json_schema"]["schema"]
             ### FUNCTION CALLING LOGIC ###
+            if param == "reasoning_effort" and value is not None:
+                optional_params["think"] = True
             if param == "tools":
                 ## CHECK IF MODEL SUPPORTS TOOL CALLING ##
                 try:
@@ -212,9 +215,9 @@ def map_openai_params(
                     litellm.add_function_to_prompt = (
                         True  # so that main.py adds the function call to the prompt
                     )
-                    optional_params[
-                        "functions_unsupported_model"
-                    ] = non_default_params.get("functions")
+                    optional_params["functions_unsupported_model"] = (
+                        non_default_params.get("functions")
+                    )
                 non_default_params.pop("tool_choice", None)  # causes ollama requests to hang
                 non_default_params.pop("functions", None)  # causes ollama requests to hang
         return optional_params
@@ -346,11 +349,31 @@ def transform_response(
 
         ## RESPONSE OBJECT
         model_response.choices[0].finish_reason = "stop"
+        response_json_message = response_json.get("message")
+        if response_json_message is not None:
+            if "thinking" in response_json_message:
+                # remap 'thinking' to 'reasoning_content'
+                response_json_message["reasoning_content"] = response_json_message[
+                    "thinking"
+                ]
+                del response_json_message["thinking"]
+            elif response_json_message.get("content") is not None:
+                # parse reasoning content from content
+                from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+                    _parse_content_for_reasoning,
+                )
+
+                reasoning_content, content = _parse_content_for_reasoning(
+                    response_json_message["content"]
+                )
+                response_json_message["reasoning_content"] = reasoning_content
+                response_json_message["content"] = content
+
         if (
             request_data.get("format", "") == "json"
             and litellm_params.get("function_name") is not None
         ):
-            function_call = json.loads(response_json["message"]["content"])
+            function_call = json.loads(response_json_message["content"])
             message = litellm.Message(
                 content=None,
                 tool_calls=[
@@ -367,11 +390,13 @@ def transform_response(
                         "type": "function",
                     }
                 ],
+                reasoning_content=response_json_message.get("reasoning_content"),
             )
             model_response.choices[0].message = message  # type: ignore
             model_response.choices[0].finish_reason = "tool_calls"
         else:
-            _message = litellm.Message(**response_json["message"])
+
+            _message = litellm.Message(**response_json_message)
             model_response.choices[0].message = _message  # type: ignore
         model_response.created = int(time.time())
         model_response.model = "ollama_chat/" + model
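The non-streaming path above leans on litellm's `_parse_content_for_reasoning` helper imported from `convert_dict_to_response.py`. The real implementation lives there; as a rough standalone sketch of the assumed behavior (split a leading `<think>...</think>` block into reasoning and the remaining content), not the actual library code:

```python
# Approximate stand-in for _parse_content_for_reasoning, for illustration only;
# the real litellm helper may differ in edge-case handling.
import re
from typing import Optional, Tuple

def parse_content_for_reasoning_sketch(text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    if text is None:
        return None, None
    match = re.match(r"\s*<think>(.*?)</think>\s*(.*)", text, re.DOTALL)
    if match:
        # reasoning inside the tags, visible answer after them
        return match.group(1).strip(), match.group(2)
    return None, text

print(parse_content_for_reasoning_sketch("<think>thinking text</think>Final answer."))
# ('thinking text', 'Final answer.')
```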
@@ -412,6 +437,9 @@ def get_model_response_iterator(
 
 
 class OllamaChatCompletionResponseIterator(BaseModelResponseIterator):
+    started_reasoning_content: bool = False
+    finished_reasoning_content: bool = False
+
     def _is_function_call_complete(self, function_args: Union[str, dict]) -> bool:
         if isinstance(function_args, dict):
             return True
@@ -465,8 +493,38 @@ def chunk_parser(self, chunk: dict) -> ModelResponseStream:
             if is_function_call_complete:
                 tool_call["id"] = str(uuid.uuid4())
 
+        # PROCESS REASONING CONTENT
+        reasoning_content: Optional[str] = None
+        content: Optional[str] = None
+        if chunk["message"].get("thinking") is not None:
+            if self.started_reasoning_content is False:
+                reasoning_content = chunk["message"].get("thinking")
+                self.started_reasoning_content = True
+            elif self.finished_reasoning_content is False:
+                reasoning_content = chunk["message"].get("thinking")
+                self.finished_reasoning_content = True
+        elif chunk["message"].get("content") is not None:
+            message_content = chunk["message"].get("content")
+            if "<think>" in message_content:
+                message_content = message_content.replace("<think>", "")
+
+                self.started_reasoning_content = True
+
+            if "</think>" in message_content and self.started_reasoning_content:
+                message_content = message_content.replace("</think>", "")
+                self.finished_reasoning_content = True
+
+            if (
+                self.started_reasoning_content
+                and not self.finished_reasoning_content
+            ):
+                reasoning_content = message_content
+            else:
+                content = message_content
+
         delta = Delta(
-            content=chunk["message"].get("content", ""),
+            content=content,
+            reasoning_content=reasoning_content,
             tool_calls=tool_calls,
         )
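The streaming iterator keeps two booleans so that tokens between `<think>` and `</think>` flow to `reasoning_content`, while everything after the closing tag goes to `content`. A small simulation of that gating logic (the token stream below is invented for illustration; real Ollama chunks carry more fields):

```python
# Simulates the started/finished flag gating used by OllamaChatCompletionResponseIterator.
# The token stream is made up for illustration.
started, finished = False, False

def route(piece: str):
    global started, finished
    if "<think>" in piece:
        piece = piece.replace("<think>", "")
        started = True
    if "</think>" in piece and started:
        piece = piece.replace("</think>", "")
        finished = True
    field = "reasoning_content" if (started and not finished) else "content"
    return field, piece

for token in ["<think>Let me", " multiply.", "</think>", "17 * 23 = 391"]:
    print(route(token))
# ('reasoning_content', 'Let me')  ('reasoning_content', ' multiply.')
# ('content', '')                  ('content', '17 * 23 = 391')
```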

litellm/llms/ollama/completion/transformation.py

Lines changed: 83 additions & 19 deletions
@@ -19,13 +19,13 @@
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.llms.openai import AllMessageValues, ChatCompletionUsageBlock
 from litellm.types.utils import (
+    Delta,
     GenericStreamingChunk,
     ModelInfoBase,
     ModelResponse,
     ModelResponseStream,
     ProviderField,
     StreamingChoices,
-    Delta,
 )
 
 from ..common_utils import OllamaError, _convert_image
@@ -92,9 +92,9 @@ class OllamaConfig(BaseConfig):
     repeat_penalty: Optional[float] = None
     temperature: Optional[float] = None
     seed: Optional[int] = None
-    stop: Optional[
-        list
-    ] = None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    stop: Optional[list] = (
+        None  # stop is a list based on this - https://github.com/ollama/ollama/pull/442
+    )
     tfs_z: Optional[float] = None
     num_predict: Optional[int] = None
     top_k: Optional[int] = None
@@ -154,6 +154,7 @@ def get_supported_openai_params(self, model: str):
             "stop",
             "response_format",
             "max_completion_tokens",
+            "reasoning_effort",
         ]
 
     def map_openai_params(
@@ -166,19 +167,21 @@ def map_openai_params(
         for param, value in non_default_params.items():
             if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["num_predict"] = value
-            if param == "stream":
+            elif param == "stream":
                 optional_params["stream"] = value
-            if param == "temperature":
+            elif param == "temperature":
                 optional_params["temperature"] = value
-            if param == "seed":
+            elif param == "seed":
                 optional_params["seed"] = value
-            if param == "top_p":
+            elif param == "top_p":
                 optional_params["top_p"] = value
-            if param == "frequency_penalty":
+            elif param == "frequency_penalty":
                 optional_params["frequency_penalty"] = value
-            if param == "stop":
+            elif param == "stop":
                 optional_params["stop"] = value
-            if param == "response_format" and isinstance(value, dict):
+            elif param == "reasoning_effort" and value is not None:
+                optional_params["think"] = True
+            elif param == "response_format" and isinstance(value, dict):
                 if value["type"] == "json_object":
                     optional_params["format"] = "json"
                 elif value["type"] == "json_schema":
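Restating the parameter mapping in isolation: with this change, OpenAI-style params are translated into Ollama options roughly as below. This is a simplified mirror of the elif chain for illustration, not the actual litellm method, and it only covers a couple of the handled params.

```python
# Simplified mirror of the param mapping touched in this hunk; real litellm handles
# many more cases (stop, response_format, drop_params, ...).
def map_params_sketch(non_default_params: dict) -> dict:
    optional_params: dict = {}
    for param, value in non_default_params.items():
        if param in ("max_tokens", "max_completion_tokens"):
            optional_params["num_predict"] = value
        elif param == "temperature":
            optional_params["temperature"] = value
        elif param == "reasoning_effort" and value is not None:
            # any non-null reasoning_effort just switches Ollama's think mode on
            optional_params["think"] = True
    return optional_params

print(map_params_sketch({"max_completion_tokens": 256, "reasoning_effort": "low"}))
# {'num_predict': 256, 'think': True}
```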
@@ -258,12 +261,17 @@ def transform_response(
         api_key: Optional[str] = None,
         json_mode: Optional[bool] = None,
     ) -> ModelResponse:
+        from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+            _parse_content_for_reasoning,
+        )
+
         response_json = raw_response.json()
         ## RESPONSE OBJECT
         model_response.choices[0].finish_reason = "stop"
         if request_data.get("format", "") == "json":
             # Check if response field exists and is not empty before parsing JSON
             response_text = response_json.get("response", "")
+
             if not response_text or not response_text.strip():
                 # Handle empty response gracefully - set empty content
                 message = litellm.Message(content="")
@@ -288,7 +296,9 @@ def transform_response(
                                 "id": f"call_{str(uuid.uuid4())}",
                                 "function": {
                                     "name": function_call["name"],
-                                    "arguments": json.dumps(function_call["arguments"]),
+                                    "arguments": json.dumps(
+                                        function_call["arguments"]
+                                    ),
                                 },
                                 "type": "function",
                             }
@@ -305,11 +315,26 @@ def transform_response(
                     model_response.choices[0].finish_reason = "stop"
             except json.JSONDecodeError:
                 # If JSON parsing fails, treat as regular text response
-                message = litellm.Message(content=response_text)
+                ## output parse reasoning content from response_text
+                reasoning_content: Optional[str] = None
+                content: Optional[str] = None
+                if response_text is not None:
+                    reasoning_content, content = _parse_content_for_reasoning(
+                        response_text
+                    )
+                message = litellm.Message(
+                    content=content, reasoning_content=reasoning_content
+                )
                 model_response.choices[0].message = message  # type: ignore
                 model_response.choices[0].finish_reason = "stop"
         else:
-            model_response.choices[0].message.content = response_json["response"]  # type: ignore
+            response_text = response_json.get("response", "")
+            content = None
+            reasoning_content = None
+            if response_text is not None:
+                reasoning_content, content = _parse_content_for_reasoning(response_text)
+            model_response.choices[0].message.content = content  # type: ignore
+            model_response.choices[0].message.reasoning_content = reasoning_content  # type: ignore
         model_response.created = int(time.time())
         model_response.model = "ollama/" + model
         _prompt = request_data.get("prompt", "")
@@ -434,12 +459,21 @@ def get_model_response_iterator(
 
 
 class OllamaTextCompletionResponseIterator(BaseModelResponseIterator):
+    def __init__(
+        self, streaming_response, sync_stream: bool, json_mode: Optional[bool] = False
+    ):
+        super().__init__(streaming_response, sync_stream, json_mode)
+        self.started_reasoning_content: bool = False
+        self.finished_reasoning_content: bool = False
+
     def _handle_string_chunk(
         self, str_line: str
     ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         return self.chunk_parser(json.loads(str_line))
 
-    def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelResponseStream]:
+    def chunk_parser(
+        self, chunk: dict
+    ) -> Union[GenericStreamingChunk, ModelResponseStream]:
         try:
             if "error" in chunk:
                 raise Exception(f"Ollama Error - {chunk}")
@@ -469,12 +503,42 @@ def chunk_parser(self, chunk: dict) -> Union[GenericStreamingChunk, ModelResponseStream]:
                 )
             elif chunk["response"]:
                 text = chunk["response"]
-                return GenericStreamingChunk(
-                    text=text,
-                    is_finished=is_finished,
-                    finish_reason="stop",
+                reasoning_content: Optional[str] = None
+                content: Optional[str] = None
+                if text is not None:
+                    if "<think>" in text:
+                        text = text.replace("<think>", "")
+                        self.started_reasoning_content = True
+                    elif "</think>" in text:
+                        text = text.replace("</think>", "")
+                        self.finished_reasoning_content = True
+
+                    if (
+                        self.started_reasoning_content
+                        and not self.finished_reasoning_content
+                    ):
+                        reasoning_content = text
+                    else:
+                        content = text
+
+                return ModelResponseStream(
+                    choices=[
+                        StreamingChoices(
+                            index=0,
+                            delta=Delta(
+                                reasoning_content=reasoning_content, content=content
+                            ),
+                        )
+                    ],
+                    finish_reason=finish_reason,
                     usage=None,
                 )
+                # return GenericStreamingChunk(
+                #     text=text,
+                #     is_finished=is_finished,
+                #     finish_reason="stop",
+                #     usage=None,
+                # )
             elif "thinking" in chunk and not chunk["response"]:
                 # Return reasoning content as ModelResponseStream so UIs can render it
                 thinking_content = chunk.get("thinking") or ""
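With the iterator now emitting `ModelResponseStream` deltas, a streaming caller can watch reasoning and answer tokens arrive separately. A hedged consumption sketch (the model name is a placeholder; whether a given delta actually carries `reasoning_content` depends on the local model and this PR's parsing):

```python
# Streaming sketch: collect reasoning vs. answer tokens from an Ollama model via litellm.
import litellm

stream = litellm.completion(
    model="ollama/deepseek-r1",  # placeholder thinking-capable model
    messages=[{"role": "user", "content": "Briefly: why is the sky blue?"}],
    reasoning_effort="low",
    stream=True,
)

reasoning_parts, answer_parts = [], []
for chunk in stream:
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):
        reasoning_parts.append(delta.reasoning_content)
    if delta.content:
        answer_parts.append(delta.content)

print("reasoning:", "".join(reasoning_parts))
print("answer:", "".join(answer_parts))
```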
File renamed without changes.

litellm/proxy/_experimental/out/onboarding.html

Lines changed: 0 additions & 1 deletion
This file was deleted.
