81 changes: 79 additions & 2 deletions README.md
@@ -15,6 +15,26 @@ def deps do
end
```

## Provider Compatibility

The following table shows which features are supported by each provider:

| Feature | OpenAI | OpenRouter | Ollama | Bedrock |
|---------|--------|------------|--------|---------|
| Basic Chat | ✅ | ✅ | ✅ | ✅ |
| Streaming | ✅ | ✅ | ✅ | ❌ |
| Function Calls | ✅ | ✅ | ❌ | ❌ |
| Auto Function Execution | ✅ | ✅ | ❌ | ❌ |
| Fallback Models | ❌ | ✅ | ❌ | ❌ |
| Provider Routing | ❌ | ✅ | ❌ | ❌ |

### Notes
- **OpenRouter** offers the most comprehensive feature set, including unique capabilities such as fallback models and provider routing (see the sketch below)
- **Bedrock** support is provided via the AWS ExAws integration and requires proper AWS configuration
- **Ollama** requires a running Ollama server instance
- **Function Calls** require the provider to support the OpenAI-compatible function calling format
- **Streaming** is **not** compatible with Tesla retries
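
For the OpenRouter-only rows above, here is a minimal settings sketch. The option names `models` (fallback list) and `provider` (routing preferences) are assumptions borrowed from OpenRouter's request format, not confirmed `provider_opts` keys — check the "Using OpenRouter" section further down for the actual options.

```elixir
# Hypothetical sketch: the :models and :provider keys mirror OpenRouter's
# request fields and may not be the exact provider_opts keys LlmComposer uses.
settings = %LlmComposer.Settings{
  # Module name assumed by analogy with LlmComposer.Providers.Ollama
  provider: LlmComposer.Providers.OpenRouter,
  provider_opts: [
    model: "openai/gpt-4o-mini",
    # Fallback models, tried in order if the primary model fails (assumed key)
    models: ["anthropic/claude-3.5-sonnet", "mistralai/mistral-large"],
    # Provider routing preferences (assumed key)
    provider: %{order: ["OpenAI", "Anthropic"]}
  ],
  system_prompt: "You are a helpful assistant."
}
```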

## Usage

### Simple Bot Definition
@@ -150,7 +170,65 @@ LlmComposer.Message.new(
)
```

No function calls support in Ollama (for now)
**Note:** Ollama does not provide token usage information, so `input_tokens` and `output_tokens` will always be empty in debug logs and response metadata. Function calls are also not supported with Ollama.

### Streaming Responses

LlmComposer supports streaming responses for real-time output, which is particularly useful for long-form content generation. This feature works with providers that support streaming (like Ollama, OpenRouter and OpenAI).

```elixir
# Make sure to configure a Tesla adapter that supports streaming (Finch is recommended)
Application.put_env(:llm_composer, :tesla_adapter, {Tesla.Adapter.Finch, name: MyFinch})
{:ok, finch} = Finch.start_link(name: MyFinch)

defmodule MyStreamingChat do
@settings %LlmComposer.Settings{
provider: LlmComposer.Providers.Ollama,
provider_opts: [model: "llama3.2"],
system_prompt: "You are a creative storyteller.",
stream_response: true
}

def run_streaming_chat() do
messages = [
%LlmComposer.Message{type: :user, content: "Tell me a short story about space exploration"}
]

{:ok, res} = LlmComposer.run_completion(@settings, messages)

# Process the stream and output content in real-time
res.stream
|> LlmComposer.parse_stream_response()
|> Enum.each(fn parsed_data ->
content = get_in(parsed_data, ["message", "content"]) || ""
if content != "", do: IO.write(content)
end)

IO.puts("\n--- Stream complete ---")
end
end

MyStreamingChat.run_streaming_chat()
```

Example of execution:

```
mix run streaming_sample.ex

Once upon a time, in the vast expanse of space, a brave astronaut embarked on a journey to explore distant galaxies. The stars shimmered as the spaceship soared beyond the known universe, uncovering secrets of the cosmos...

--- Stream complete ---
```

**Note:** The `stream_response: true` setting enables streaming mode, and `parse_stream_response/1` filters and decodes the raw stream data into usable chunks.

**Important:** When reading a chat completion as a stream, LlmComposer does not track input/output/cache/thinking tokens. There are two approaches to token counting in this mode:

1. Calculate tokens yourself with a library such as `tiktoken` (for the OpenAI provider).
2. Read the token data from the last stream object if the provider supplies it (currently only OpenRouter does), as sketched below.

With the Ollama provider, tokens are not tracked at all.
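
As a rough illustration of the second approach, the sketch below accumulates the streamed content and keeps the last `"usage"` map it encounters. It assumes OpenRouter emits OpenAI-style chat-completion chunks whose final chunk carries `"prompt_tokens"`/`"completion_tokens"`, and it assumes the OpenRouter provider module name; the exact chunk shape may vary, so treat this as a starting point rather than the library's API.

```elixir
# Sketch only: assumes OpenAI-style chunks where the final OpenRouter chunk
# includes a "usage" map. The OpenRouter provider module name is assumed here.
settings = %LlmComposer.Settings{
  provider: LlmComposer.Providers.OpenRouter,
  provider_opts: [model: "openai/gpt-4o-mini"],
  stream_response: true
}

messages = [%LlmComposer.Message{type: :user, content: "Tell me a short story"}]

{:ok, res} = LlmComposer.run_completion(settings, messages)

{content, usage} =
  res.stream
  |> LlmComposer.parse_stream_response()
  |> Enum.reduce({"", nil}, fn chunk, {acc, usage} ->
    delta =
      case chunk do
        %{"choices" => [%{"delta" => %{"content" => text}} | _]} when is_binary(text) -> text
        _ -> ""
      end

    # Only the last chunk is expected to carry a "usage" map, so keep the latest one seen.
    {acc <> delta, chunk["usage"] || usage}
  end)

IO.puts(content)
IO.inspect(usage, label: "token usage (nil if the provider did not report it)")
```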

### Using OpenRouter

@@ -334,4 +412,3 @@ In this example, the bot first calls OpenAI to understand the user's intent and
Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
and published on [HexDocs](https://hexdocs.pm). Once published, the docs can
be found at <https://hexdocs.pm/llm_composer>.

1 change: 1 addition & 0 deletions config/config.exs
@@ -4,6 +4,7 @@ config :llm_composer,
openai_key: "",
ollama_uri: "http://localhost:11434",
open_router_key: "",
tesla_adapter: nil,
timeout: nil

import_config "#{Mix.env()}.exs"
54 changes: 54 additions & 0 deletions lib/llm_composer.ex
@@ -87,6 +87,7 @@ defmodule LlmComposer do
provider_opts =
Keyword.merge(settings.provider_opts,
functions: settings.functions,
stream_response: settings.stream_response,
api_key: settings.api_key
)

@@ -110,6 +111,59 @@
end)
end

@doc """
Processes a raw stream response and returns a parsed stream of message content.

## Parameters
- `stream`: The raw stream object from the LLM response.

## Returns
- A stream that yields decoded chunk maps, with "[DONE]" markers and undecodable chunks filtered out.

## Example

```elixir
# Streaming has been tested with the Finch adapter; other adapters may also work.
Application.put_env(:llm_composer, :tesla_adapter, {Tesla.Adapter.Finch, name: MyFinch})
{:ok, finch} = Finch.start_link(name: MyFinch)

settings = %LlmComposer.Settings{
provider: LlmComposer.Providers.Ollama,
provider_opts: [model: "llama3.2"],
stream_response: true
}

messages = [
%LlmComposer.Message{type: :user, content: "Tell me a short story"}
]

{:ok, res} = LlmComposer.run_completion(settings, messages)

# Process the stream and print each parsed chunk
res.stream
|> LlmComposer.parse_stream_response()
|> Enum.each(fn parsed_data ->
content = get_in(parsed_data, ["message", "content"])
if content, do: IO.write(content)
end)
```
"""
@spec parse_stream_response(Enumerable.t()) :: Enumerable.t()
def parse_stream_response(stream) do
stream
|> Stream.filter(fn chunk -> chunk != "[DONE]" end)
|> Stream.map(fn data ->
case Jason.decode(data) do
{:ok, parsed} ->
parsed

{:error, _} ->
nil
end
end)
|> Stream.filter(fn content -> content != nil and content != "" end)
end

@spec user_prompt(Settings.t(), String.t(), map()) :: String.t()
defp user_prompt(settings, message, opts) do
prompt = Map.get(opts, :user_prompt_prefix, settings.user_prompt_prefix)
53 changes: 53 additions & 0 deletions lib/llm_composer/http_client.ex
@@ -0,0 +1,53 @@
defmodule LlmComposer.HttpClient do
@moduledoc """
Helper module for setting up the Tesla HTTP client and its options.
"""

@default_timeout 50_000

@spec client(binary(), keyword()) :: Tesla.Client.t()
def client(base_url, opts \\ []) do
base_url
|> middlewares(opts)
|> Tesla.client(adapter())
end

@spec adapter() :: term()
defp adapter do
Application.get_env(:llm_composer, :tesla_adapter)
end

@spec middlewares(binary(), keyword()) :: list(term())
defp middlewares(base_url, opts) do
stream = Keyword.get(opts, :stream_response)

resp = [
{
Tesla.Middleware.BaseUrl,
base_url
},
Tesla.Middleware.JSON
]

if stream do
resp ++ [{Tesla.Middleware.SSE, only: :data}]
else
resp ++
[
{Tesla.Middleware.Retry,
delay: :timer.seconds(1),
max_delay: :timer.seconds(10),
max_retries: 10,
should_retry: fn
{:ok, %{status: status}} when status in [429, 500, 503] -> true
{:error, :closed} -> true
_other -> false
end},
{Tesla.Middleware.Timeout,
timeout:
Application.get_env(:llm_composer, :timeout) ||
Keyword.get(opts, :default_timeout, @default_timeout)}
]
end
end
end
44 changes: 41 additions & 3 deletions lib/llm_composer/llm_response.ex
@@ -11,11 +11,12 @@ defmodule LlmComposer.LlmResponse do
@type t() :: %__MODULE__{
actions: [[FunctionCall.t()]] | [FunctionCall.t()],
input_tokens: pos_integer() | nil,
main_response: Message.t(),
main_response: Message.t() | nil,
output_tokens: pos_integer() | nil,
previous_response: map() | nil,
raw: map(),
status: :ok | :error
status: :ok | :error,
stream: nil | Enum.t()
}

defstruct [
@@ -25,7 +26,8 @@
:output_tokens,
:previous_response,
:raw,
:status
:status,
:stream
]

@type model_response :: Tesla.Env.result()
@@ -41,6 +43,24 @@
{:error, resp}
end

# Stream response case
def new(
{status, %{response: stream}} = raw_response,
llm_model
)
when llm_model in [:open_ai, :open_router] and is_function(stream) do
{:ok,
%__MODULE__{
actions: [],
input_tokens: nil,
output_tokens: nil,
stream: stream,
main_response: nil,
raw: raw_response,
status: status
}}
end

def new(
{status,
%{actions: actions, response: %{"choices" => [first_choice | _]} = raw_response}},
@@ -65,6 +85,24 @@
}}
end

# Stream response case for Ollama
def new(
{status, %{response: stream}} = raw_response,
:ollama
)
when is_function(stream) do
{:ok,
%__MODULE__{
actions: [],
input_tokens: nil,
output_tokens: nil,
stream: stream,
main_response: nil,
raw: raw_response,
status: status
}}
end

def new(
{status, %{actions: actions, response: %{"message" => message} = raw_response}},
:ollama
24 changes: 5 additions & 19 deletions lib/llm_composer/providers/ollama.ex
@@ -6,28 +6,12 @@ defmodule LlmComposer.Providers.Ollama do
"""
@behaviour LlmComposer.Provider

use Tesla

alias LlmComposer.HttpClient
alias LlmComposer.LlmResponse
alias LlmComposer.Providers.Utils

@uri Application.compile_env(:llm_composer, :ollama_uri, "http://localhost:11434")

plug(Tesla.Middleware.BaseUrl, @uri)

plug(Tesla.Middleware.JSON)

plug(Tesla.Middleware.Retry,
delay: :timer.seconds(1),
max_delay: :timer.seconds(10),
max_retries: 5,
should_retry: fn
{:ok, %{status: status}} when status in [429, 500, 503] -> true
{:error, :closed} -> true
_other -> false
end
)

@impl LlmComposer.Provider
def name, do: :ollama

@@ -37,11 +21,13 @@
"""
def run(messages, system_message, opts) do
model = Keyword.get(opts, :model)
client = HttpClient.client(@uri, opts)
req_opts = Utils.get_req_opts(opts)

if model do
messages
|> build_request(system_message, model, opts)
|> then(&post("/api/chat", &1))
|> then(&Tesla.post(client, "/api/chat", &1, opts: req_opts))
|> handle_response()
|> LlmResponse.new(name())
else
Expand All @@ -52,7 +38,7 @@ defmodule LlmComposer.Providers.Ollama do
defp build_request(messages, system_message, model, opts) do
base_request = %{
model: model,
stream: false,
stream: Keyword.get(opts, :stream_response, false),
# tools: get_tools(Keyword.get(opts, :functions)),
messages: Utils.map_messages([system_message | messages])
}