Commit dfb53d1

feat: add reasoning (#752)
1 parent a97b83d commit dfb53d1

11 files changed (+207 additions, -69 deletions)

docs/how-to/llms/use_llms.md

Lines changed: 4 additions & 1 deletion
@@ -29,7 +29,7 @@ LLMs in Ragbits allow you to customize the behavior of the model using various o
 
 ### LiteLLM Options
 
-The `LiteLLMOptions` class provides options for remote LLMs, aligning with the LiteLLM API. These options allow you to control the behavior of models from various providers. Each of the options is described in the [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input).
+The `LiteLLMOptions` class provides options for remote LLMs, aligning with the LiteLLM API. These options allow you to control the behavior of models from various providers. Each of the options is described in the [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input) and the [LiteLLM reasoning documentation](https://docs.litellm.ai/docs/reasoning_content).
 
 Example usage:
 
 ```python
@@ -47,6 +47,9 @@ response = llm.generate("Write a short story about a robot learning to paint.")
 print(response)
 ```
 
+!!! warning
+    If you provide `reasoning_effort` to an OpenAI model, [the reasoning content will not be returned](https://platform.openai.com/docs/guides/reasoning?api-mode=responses).
+
 ## Using Local LLMs
 
 For guidance on setting up and using local models in Ragbits, refer to the [Local LLMs Guide](https://ragbits.deepsense.ai/how-to/llms/use_local_llms/).
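
Not part of the commit, but for readers of the docs change above, a minimal usage sketch of the documented options, assuming the `LiteLLM`/`LiteLLMOptions` API shown elsewhere in this diff; the model name is illustrative:

```python
import asyncio

from ragbits.core.llms import LiteLLM, LiteLLMOptions


async def main() -> None:
    # Request a reasoning trace alongside the answer.
    options = LiteLLMOptions(reasoning_effort="medium")
    llm = LiteLLM(model_name="claude-3-7-sonnet-20250219", default_options=options)

    response = await llm.generate_with_metadata("Write a short story about a robot learning to paint.")
    print(response.reasoning)  # the trace, or None when the provider does not return one (see the warning above)
    print(response.content)    # the final answer


asyncio.run(main())
```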

examples/core/llms/reasoning.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+"""
+Ragbits Core Example: Reasoning with LLM
+
+This example demonstrates how to use reasoning with LLM.
+
+To run the script, execute the following command:
+
+```bash
+uv run examples/core/llms/reasoning.py
+```
+"""
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "ragbits-core",
+# ]
+# ///
+
+import asyncio
+
+from ragbits.core.llms import LiteLLM, LiteLLMOptions
+
+
+async def main() -> None:
+    """
+    Run the example.
+    """
+    options = LiteLLMOptions(reasoning_effort="medium")
+    model = LiteLLM(model_name="claude-3-7-sonnet-20250219", default_options=options)
+    response = await model.generate_with_metadata(
+        "Do you like Jazz?",
+    )
+    print(f"reasoning: {response.reasoning}")
+
+    options = LiteLLMOptions(thinking={"type": "enabled", "budget_tokens": 1024})
+    model = LiteLLM(model_name="claude-3-7-sonnet-20250219", default_options=options)
+    response = await model.generate_with_metadata(
+        "Do you like Jazz?",
+    )
+    print(f"reasoning: {response.reasoning}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
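
Both calls in the new example target the same Anthropic model: the first uses the provider-agnostic `reasoning_effort` option, the second the Anthropic-native `thinking` budget, and in both cases the trace is read back from `response.reasoning`.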

packages/ragbits-core/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Add support for Reasoning models (#752)
 - Fix issue with cost calculation for some models (#748)
 - Fix issue with improper conversion to json of tool call arguments (#737)
 - Added Google Drive support (#686)

packages/ragbits-core/src/ragbits/core/llms/base.py

Lines changed: 18 additions & 5 deletions
@@ -197,13 +197,18 @@ def __repr__(self) -> str:
         )
 
 
+class Reasoning(str):
+    """A class for reasoning streaming"""
+
+
 class LLMResponseWithMetadata(BaseModel, Generic[PromptOutputT]):
     """
     A schema of output with metadata
     """
 
     content: PromptOutputT
     metadata: dict = {}
+    reasoning: str | None = None
     tool_calls: list[ToolCall] | None = None
     usage: Usage | None = None
 
@@ -571,12 +576,14 @@ async def generate_with_metadata(
         )
 
         content = response.pop("response")
+        reasoning = response.pop("reasoning", None)
 
         if isinstance(prompt, BasePromptWithParser) and content:
             content = await prompt.parse_response(content)
 
         response_with_metadata = LLMResponseWithMetadata[type(content)](  # type: ignore
             content=content,
+            reasoning=reasoning,
             tool_calls=tool_calls,
             metadata=response,
             usage=usage,
@@ -623,7 +630,7 @@ def generate_streaming(
         *,
         tools: None = None,
         options: LLMClientOptionsT | None = None,
-    ) -> LLMResultStreaming[str]: ...
+    ) -> LLMResultStreaming[str | Reasoning]: ...
 
     @overload
     def generate_streaming(
@@ -632,7 +639,7 @@ def generate_streaming(
         *,
         tools: list[Tool],
         options: LLMClientOptionsT | None = None,
-    ) -> LLMResultStreaming[str | ToolCall]: ...
+    ) -> LLMResultStreaming[str | Reasoning | ToolCall]: ...
 
     def generate_streaming(
         self,
@@ -661,7 +668,7 @@ async def _stream_internal(
         *,
         tools: list[Tool] | None = None,
         options: LLMClientOptionsT | None = None,
-    ) -> AsyncGenerator[str | ToolCall | LLMResponseWithMetadata, None]:
+    ) -> AsyncGenerator[str | Reasoning | ToolCall | LLMResponseWithMetadata, None]:
         with trace(model_name=self.model_name, prompt=prompt, options=repr(options)) as outputs:
             merged_options = (self.default_options | options) if options else self.default_options
             if isinstance(prompt, str | list):
@@ -679,12 +686,17 @@ async def _stream_internal(
             )
 
             content = ""
+            reasoning = ""
            tool_calls = []
             usage_data = {}
             async for chunk in response:
                 if text := chunk.get("response"):
-                    content += text
-                    yield text
+                    if chunk.get("reasoning"):
+                        reasoning += text
+                        yield Reasoning(text)
+                    else:
+                        content += text
+                        yield text
 
                 if tools and (_tool_calls := chunk.get("tool_calls")):
                     for tool_call in _tool_calls:
@@ -706,6 +718,7 @@
 
             outputs.response = LLMResponseWithMetadata[type(content or None)](  # type: ignore
                 content=content or None,
+                reasoning=reasoning or None,
                 tool_calls=tool_calls or None,
                 usage=usage,
             )
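
The `Reasoning` marker type added above is a `str` subclass, so streaming consumers can separate reasoning deltas from answer deltas with an `isinstance` check. A minimal sketch, not part of the commit; the import path for `Reasoning` follows the module it is defined in here (it may also be re-exported), and the model name is illustrative:

```python
import asyncio

from ragbits.core.llms import LiteLLM, LiteLLMOptions
from ragbits.core.llms.base import Reasoning


async def stream_answer() -> None:
    options = LiteLLMOptions(reasoning_effort="low")
    llm = LiteLLM(model_name="claude-3-7-sonnet-20250219", default_options=options)

    async for chunk in llm.generate_streaming("Do you like Jazz?"):
        if isinstance(chunk, Reasoning):
            # Reasoning deltas arrive as the str subclass, so check for them before plain str.
            print(f"[reasoning] {chunk}")
        else:
            print(chunk, end="")


asyncio.run(stream_answer())
```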

packages/ragbits-core/src/ragbits/core/llms/exceptions.py

Lines changed: 9 additions & 0 deletions
@@ -70,3 +70,12 @@ class LLMNotSupportingToolUseError(LLMError):
 
     def __init__(self, message: str = "There are tools provided, but given LLM doesn't support tool use.") -> None:
         super().__init__(message)
+
+
+class LLMNotSupportingReasoningEffortError(LLMError):
+    """
+    Raised when there is reasoning effort provided, but LLM doesn't support it.
+    """
+
+    def __init__(self, model_name: str) -> None:
+        super().__init__(f"Model {model_name} does not support reasoning effort.")

packages/ragbits-core/src/ragbits/core/llms/litellm.py

Lines changed: 22 additions & 4 deletions
@@ -1,7 +1,7 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Callable, Iterable
-from typing import Any
+from typing import Any, Literal
 
 import litellm
 import tiktoken
@@ -17,6 +17,7 @@
     LLMEmptyResponseError,
     LLMNotSupportingImagesError,
     LLMNotSupportingPdfsError,
+    LLMNotSupportingReasoningEffortError,
     LLMNotSupportingToolUseError,
     LLMResponseError,
     LLMStatusError,
@@ -29,6 +30,7 @@ class LiteLLMOptions(LLMOptions):
     """
     Dataclass that represents all available LLM call options for the LiteLLM client.
     Each of them is described in the [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input).
+    Reasoning effort and thinking are described in the [LiteLLM reasoning documentation](https://docs.litellm.ai/docs/reasoning_content).
     """
 
     frequency_penalty: float | None | NotGiven = NOT_GIVEN
@@ -45,6 +47,8 @@ class LiteLLMOptions(LLMOptions):
     mock_response: str | None | NotGiven = NOT_GIVEN
     tpm: int | None | NotGiven = NOT_GIVEN
     rpm: int | None | NotGiven = NOT_GIVEN
+    reasoning_effort: Literal["low", "medium", "high"] | None | NotGiven = NOT_GIVEN
+    thinking: dict | None | NotGiven = NOT_GIVEN
 
 
 class LiteLLM(LLM[LiteLLMOptions]):
@@ -185,6 +189,9 @@ async def _call(
         if tools and not litellm.supports_function_calling(self.model_name):
             raise LLMNotSupportingToolUseError()
 
+        if options.reasoning_effort and not litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
+
         start_time = time.perf_counter()
         raw_responses = await asyncio.gather(
             *(
@@ -209,6 +216,7 @@ async def _call(
 
             result = {}
             result["response"] = response.choices[0].message.content  # type: ignore
+            result["reasoning"] = getattr(response.choices[0].message, "reasoning_content", None)  # type: ignore
             result["throughput"] = throughput_batch / float(len(raw_responses))
 
             result["tool_calls"] = (
@@ -274,6 +282,9 @@ async def _call_streaming(
         if tools and not litellm.supports_function_calling(self.model_name):
             raise LLMNotSupportingToolUseError()
 
+        if options.reasoning_effort and not litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
+
         response_format = self._get_response_format(output_schema=prompt.output_schema(), json_mode=prompt.json_mode)
         input_tokens = self.count_tokens(prompt)
 
@@ -288,7 +299,6 @@ async def _call_streaming(
             stream=True,
             stream_options={"include_usage": True},
         )
-
         if not response.completion_stream and not response.choices:  # type: ignore
             raise LLMEmptyResponseError()
 
@@ -298,7 +308,8 @@ async def response_to_async_generator(response: CustomStreamWrapper) -> AsyncGen
            tool_calls: list[dict] = []
 
            async for item in response:
-                if content := item.choices[0].delta.content:
+                reasoning_content = getattr(item.choices[0].delta, "reasoning_content", None)
+                if content := item.choices[0].delta.content or reasoning_content:
                    output_tokens += 1
                    if output_tokens == 1:
                        record_metric(
@@ -308,7 +319,8 @@ async def response_to_async_generator(response: CustomStreamWrapper) -> AsyncGen
                            model=self.model_name,
                            prompt=prompt.__class__.__name__,
                        )
-                    yield {"response": content}
+
+                    yield {"response": content, "reasoning": bool(reasoning_content)}
 
                if tool_calls_delta := item.choices[0].delta.tool_calls:
                    for tool_call_chunk in tool_calls_delta:
@@ -412,6 +424,12 @@ async def _get_litellm_response(
            **options.dict(),
        }
 
+        supported_openai_params = litellm.get_supported_openai_params(model=self.model_name) or []
+        if "reasoning_effort" not in supported_openai_params:
+            completion_kwargs.pop("reasoning_effort")
+        if "thinking" not in supported_openai_params:
+            completion_kwargs.pop("thinking")
+
        if stream_options is not None:
            completion_kwargs["stream_options"] = stream_options
 
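
The gating in this file relies on two LiteLLM helpers that are also usable standalone; a small sketch of what they report (the model names are only examples):

```python
import litellm

for model_name in ("claude-3-7-sonnet-20250219", "gpt-4o-mini"):
    # Used above to raise LLMNotSupportingReasoningEffortError before any request is made.
    print(model_name, "supports reasoning:", litellm.supports_reasoning(model_name))

    # Used in _get_litellm_response to drop the "reasoning_effort"/"thinking" kwargs
    # that the target provider does not accept.
    supported = litellm.get_supported_openai_params(model=model_name) or []
    print(model_name, "accepts reasoning_effort:", "reasoning_effort" in supported)
    print(model_name, "accepts thinking:", "thinking" in supported)
```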

packages/ragbits-core/src/ragbits/core/llms/local.py

Lines changed: 2 additions & 1 deletion
@@ -157,6 +157,7 @@ async def _call(
         for i, response in enumerate(responses):
             result = {}
             result["response"] = self.tokenizer.decode(response, skip_special_tokens=True)
+            result["reasoning"] = None
             prompt_tokens = tokens_in[i]
             completion_tokens = sum(response != self.tokenizer._pad_token_type_id)
             result["usage"] = {
@@ -222,7 +223,7 @@ async def streamer_to_async_generator(
                     prompt=prompt.__class__.__name__,
                 )
 
-                yield {"response": text}
+                yield {"response": text, "reasoning": False}
                 await asyncio.sleep(0.0)
 
             generation_thread.join()

packages/ragbits-core/src/ragbits/core/llms/mock.py

Lines changed: 10 additions & 0 deletions
@@ -14,6 +14,8 @@ class MockLLMOptions(LLMOptions):
     response: str | NotGiven = NOT_GIVEN
     response_stream: list[str] | NotGiven = NOT_GIVEN
     tool_calls: list[dict] | NotGiven = NOT_GIVEN
+    reasoning: str | NotGiven = NOT_GIVEN
+    reasoning_stream: list[str] | NotGiven = NOT_GIVEN
 
 
 class MockLLM(LLM[MockLLMOptions]):
@@ -69,6 +71,7 @@ async def _call( # noqa: PLR6301
         prompt = list(prompt)
         self.calls.extend([p.chat for p in prompt])
         response = "mocked response" if isinstance(options.response, NotGiven) else options.response
+        reasoning = None if isinstance(options.reasoning, NotGiven) else options.reasoning
         tool_calls = (
             None
             if isinstance(options.tool_calls, NotGiven)
@@ -78,6 +81,7 @@ async def _call( # noqa: PLR6301
         return [
             {
                 "response": response,
+                "reasoning": reasoning,
                 "tool_calls": tool_calls,
                 "is_mocked": True,
                 "throughput": 1 / len(prompt),
@@ -107,10 +111,16 @@ async def generator() -> AsyncGenerator[dict, None]:
             ):
                 yield {"tool_calls": options.tool_calls}
             elif not isinstance(options.response_stream, NotGiven):
+                if not isinstance(options.reasoning_stream, NotGiven):
+                    for reasoning in options.reasoning_stream:
+                        yield {"response": reasoning, "reasoning": True}
                 for response in options.response_stream:
                     yield {"response": response}
             elif not isinstance(options.response, NotGiven):
+                if not isinstance(options.reasoning, NotGiven):
+                    yield {"response": options.reasoning, "reasoning": True}
                 yield {"response": options.response}
+
             else:
                 yield {"response": "mocked response"}
 
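
A minimal sketch of exercising the new mock options in a test, assuming pytest with pytest-asyncio (neither is part of this commit) and that `MockLLM` accepts `default_options` like the other LLM classes:

```python
import pytest

from ragbits.core.llms.mock import MockLLM, MockLLMOptions


@pytest.mark.asyncio
async def test_mock_llm_returns_reasoning() -> None:
    options = MockLLMOptions(response="final answer", reasoning="mocked chain of thought")
    llm = MockLLM(default_options=options)

    response = await llm.generate_with_metadata("Do you like Jazz?")

    assert response.content == "final answer"
    assert response.reasoning == "mocked chain of thought"
```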
