
Commit 2c219e0

Stream responses (#29)
* stream responses for openai and openrouter
* default config
* fix credo
* stream read response from ollama
* readme and function in llm_composer mod
* more info in README
* added note
* fix example
* a bit more doc
* lower finch min
* updated min version of Tesla because fix in finch
* fix in openrouter
* Update lib/llm_composer/http_client.ex

Co-authored-by: Hector Perez <hecpeare@gmail.com>

* Update README.md

Co-authored-by: Hector Perez <hecpeare@gmail.com>

---------

Co-authored-by: Hector Perez <hecpeare@gmail.com>
1 parent dd2146a commit 2c219e0

File tree

12 files changed: +279 −84 lines

README.md

Lines changed: 79 additions & 2 deletions
@@ -15,6 +15,26 @@ def deps do
 end
 ```

+## Provider Compatibility
+
+The following table shows which features are supported by each provider:
+
+| Feature | OpenAI | OpenRouter | Ollama | Bedrock |
+|---------|--------|------------|--------|---------|
+| Basic Chat | ✅ | ✅ | ✅ | ✅ |
+| Streaming | ✅ | ✅ | ✅ | ❌ |
+| Function Calls | ✅ | ✅ | ❌ | ❌ |
+| Auto Function Execution | ✅ | ✅ | ❌ | ❌ |
+| Fallback Models | ❌ | ✅ | ❌ | ❌ |
+| Provider Routing | ❌ | ✅ | ❌ | ❌ |
+
+### Notes:
+- **OpenRouter** offers the most comprehensive feature set, including unique capabilities like fallback models and provider routing
+- **Bedrock** support is provided via AWS ExAws integration and requires proper AWS configuration
+- **Ollama** requires an Ollama server instance to be running
+- **Function Calls** require the provider to support the OpenAI-compatible function-calling format
+- **Streaming** is **not** compatible with Tesla **retries**.
+
 ## Usage
 
 ### Simple Bot Definition
@@ -150,7 +170,65 @@ LlmComposer.Message.new(
 )
 ```
 
-No function calls support in Ollama (for now)
+**Note:** Ollama does not provide token usage information, so `input_tokens` and `output_tokens` will always be empty in debug logs and response metadata. Function calls are also not supported with Ollama.
+
+### Streaming Responses
+
+LlmComposer supports streaming responses for real-time output, which is particularly useful for long-form content generation. This feature works with providers that support streaming (such as Ollama, OpenRouter, and OpenAI).
+
+```elixir
+# Make sure to configure a Tesla adapter that supports streaming (Finch recommended)
+Application.put_env(:llm_composer, :tesla_adapter, {Tesla.Adapter.Finch, name: MyFinch})
+{:ok, finch} = Finch.start_link(name: MyFinch)
+
+defmodule MyStreamingChat do
+  @settings %LlmComposer.Settings{
+    provider: LlmComposer.Providers.Ollama,
+    provider_opts: [model: "llama3.2"],
+    system_prompt: "You are a creative storyteller.",
+    stream_response: true
+  }
+
+  def run_streaming_chat() do
+    messages = [
+      %LlmComposer.Message{type: :user, content: "Tell me a short story about space exploration"}
+    ]
+
+    {:ok, res} = LlmComposer.run_completion(@settings, messages)
+
+    # Process the stream and output content in real-time
+    res.stream
+    |> LlmComposer.parse_stream_response()
+    |> Enum.each(fn parsed_data ->
+      content = get_in(parsed_data, ["message", "content"]) || ""
+      if content != "", do: IO.write(content)
+    end)
+
+    IO.puts("\n--- Stream complete ---")
+  end
+end
+
+MyStreamingChat.run_streaming_chat()
+```
+
+Example of execution:
+
+```
+mix run streaming_sample.ex
+
+Once upon a time, in the vast expanse of space, a brave astronaut embarked on a journey to explore distant galaxies. The stars shimmered as the spaceship soared beyond the known universe, uncovering secrets of the cosmos...
+
+--- Stream complete ---
+```
+
+**Note:** The `stream_response: true` setting enables streaming mode, and `parse_stream_response/1` filters and parses the raw stream data into usable content chunks.
+
+**Important:** When streaming chat completions, LlmComposer does not track input/output/cache/thinking tokens. There are two approaches to handle token counting in this mode:
+
+1. Calculate tokens yourself with a library such as `tiktoken` (for the OpenAI provider).
+2. Read the token data from the last stream object if the provider supplies it (currently only OpenRouter supports this); see the sketch after this list.
+
+With the Ollama provider, tokens are not tracked at all.
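A minimal sketch of approach 2, assuming OpenRouter reports token counts under a `"usage"` key (with `"prompt_tokens"`/`"completion_tokens"` fields) in its final stream chunk; the reduction below consumes the stream, so in practice you would combine it with the content handling shown above:

```elixir
# Sketch: keep the last chunk that carries usage data while draining the stream.
# `settings` and `messages` are assumed to be built as in the streaming example above;
# the "usage" / "prompt_tokens" / "completion_tokens" keys are assumptions about the
# OpenRouter payload shape.
{:ok, res} = LlmComposer.run_completion(settings, messages)

usage =
  res.stream
  |> LlmComposer.parse_stream_response()
  |> Enum.reduce(nil, fn chunk, acc -> Map.get(chunk, "usage") || acc end)

if usage do
  IO.puts("input: #{usage["prompt_tokens"]}, output: #{usage["completion_tokens"]}")
end
```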
 
 ### Using OpenRouter
 
@@ -334,4 +412,3 @@ In this example, the bot first calls OpenAI to understand the user's intent and
 Documentation can be generated with [ExDoc](https://github.com/elixir-lang/ex_doc)
 and published on [HexDocs](https://hexdocs.pm). Once published, the docs can
 be found at <https://hexdocs.pm/llm_composer>.
-

config/config.exs

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ config :llm_composer,
   openai_key: "",
   ollama_uri: "http://localhost:11434",
   open_router_key: "",
+  tesla_adapter: nil,
   timeout: nil
 
 import_config "#{Mix.env()}.exs"
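A minimal sketch of how an application using llm_composer could set the new `tesla_adapter` key in its own config, assuming the Finch adapter from the README example (`MyFinch` is a placeholder name):

```elixir
# config/config.exs of a consuming application (sketch).
import Config

config :llm_composer,
  tesla_adapter: {Tesla.Adapter.Finch, name: MyFinch},
  timeout: 60_000
```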

lib/llm_composer.ex

Lines changed: 54 additions & 0 deletions
@@ -87,6 +87,7 @@ defmodule LlmComposer do
     provider_opts =
       Keyword.merge(settings.provider_opts,
         functions: settings.functions,
+        stream_response: settings.stream_response,
         api_key: settings.api_key
       )

@@ -110,6 +111,59 @@ defmodule LlmComposer do
     end)
   end
 
+  @doc """
+  Processes a raw stream response and returns a parsed stream of message content.
+
+  ## Parameters
+    - `stream`: The raw stream object from the LLM response.
+
+  ## Returns
+    - A stream that yields parsed content maps, filtering out "[DONE]" markers and decode errors.
+
+  ## Example
+
+  ```elixir
+  # Stream tested with Finch; it may work with other adapters.
+  Application.put_env(:llm_composer, :tesla_adapter, {Tesla.Adapter.Finch, name: MyFinch})
+  {:ok, finch} = Finch.start_link(name: MyFinch)
+
+  settings = %LlmComposer.Settings{
+    provider: LlmComposer.Providers.Ollama,
+    provider_opts: [model: "llama3.2"],
+    stream_response: true
+  }
+
+  messages = [
+    %LlmComposer.Message{type: :user, content: "Tell me a short story"}
+  ]
+
+  {:ok, res} = LlmComposer.run_completion(settings, messages)
+
+  # Process the stream and print each parsed chunk
+  res.stream
+  |> LlmComposer.parse_stream_response()
+  |> Enum.each(fn parsed_data ->
+    content = get_in(parsed_data, ["message", "content"])
+    if content, do: IO.write(content)
+  end)
+  ```
+  """
+  @spec parse_stream_response(Enumerable.t()) :: Enumerable.t()
+  def parse_stream_response(stream) do
+    stream
+    |> Stream.filter(fn chunk -> chunk != "[DONE]" end)
+    |> Stream.map(fn data ->
+      case Jason.decode(data) do
+        {:ok, parsed} ->
+          parsed
+
+        {:error, _} ->
+          nil
+      end
+    end)
+    |> Stream.filter(fn content -> content != nil and content != "" end)
+  end
+
   @spec user_prompt(Settings.t(), String.t(), map()) :: String.t()
   defp user_prompt(settings, message, opts) do
     prompt = Map.get(opts, :user_prompt_prefix, settings.user_prompt_prefix)
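Because `parse_stream_response/1` returns a lazy stream of decoded maps, it composes with ordinary `Stream`/`Enum` pipelines. A minimal sketch, assuming Ollama-style chunks where the text lives under `"message"` → `"content"`, that joins the whole reply into one string:

```elixir
# Sketch: collect a streamed reply into a single string (Ollama chunk shape assumed);
# `res` is a streamed LlmComposer response as in the doc example above.
full_text =
  res.stream
  |> LlmComposer.parse_stream_response()
  |> Enum.map_join("", fn chunk -> get_in(chunk, ["message", "content"]) || "" end)

IO.puts(full_text)
```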

lib/llm_composer/http_client.ex

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+defmodule LlmComposer.HttpClient do
+  @moduledoc """
+  Helper module for setting up the Tesla HTTP client and its options.
+  """
+
+  @default_timeout 50_000
+
+  @spec client(binary(), keyword()) :: Tesla.Client.t()
+  def client(base_url, opts \\ []) do
+    base_url
+    |> middlewares(opts)
+    |> Tesla.client(adapter())
+  end
+
+  @spec adapter() :: term()
+  defp adapter do
+    Application.get_env(:llm_composer, :tesla_adapter)
+  end
+
+  @spec middlewares(binary(), keyword()) :: list(term())
+  defp middlewares(base_url, opts) do
+    stream = Keyword.get(opts, :stream_response)
+
+    resp = [
+      {
+        Tesla.Middleware.BaseUrl,
+        base_url
+      },
+      Tesla.Middleware.JSON
+    ]
+
+    if stream do
+      resp ++ [{Tesla.Middleware.SSE, only: :data}]
+    else
+      resp ++
+        [
+          {Tesla.Middleware.Retry,
+           delay: :timer.seconds(1),
+           max_delay: :timer.seconds(10),
+           max_retries: 10,
+           should_retry: fn
+             {:ok, %{status: status}} when status in [429, 500, 503] -> true
+             {:error, :closed} -> true
+             _other -> false
+           end},
+          {Tesla.Middleware.Timeout,
+           timeout:
+             Application.get_env(:llm_composer, :timeout) ||
+               Keyword.get(opts, :default_timeout, @default_timeout)}
+        ]
+    end
+  end
+end
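A rough sketch of how a provider module can use this helper, mirroring the Ollama provider change further down; with `stream_response: true` the SSE middleware is used and the Retry/Timeout middlewares are skipped, matching the `middlewares/2` logic above. The URL and body shape are taken from the Ollama provider:

```elixir
# Sketch: build a streaming client for a local Ollama server and POST a chat request.
client = LlmComposer.HttpClient.client("http://localhost:11434", stream_response: true)

body = %{model: "llama3.2", stream: true, messages: [%{role: "user", content: "hi"}]}
{:ok, env} = Tesla.post(client, "/api/chat", body)
IO.inspect(env.status)
```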

lib/llm_composer/llm_response.ex

Lines changed: 41 additions & 3 deletions
@@ -11,11 +11,12 @@ defmodule LlmComposer.LlmResponse do
   @type t() :: %__MODULE__{
           actions: [[FunctionCall.t()]] | [FunctionCall.t()],
           input_tokens: pos_integer() | nil,
-          main_response: Message.t(),
+          main_response: Message.t() | nil,
           output_tokens: pos_integer() | nil,
           previous_response: map() | nil,
           raw: map(),
-          status: :ok | :error
+          status: :ok | :error,
+          stream: nil | Enum.t()
         }
 
   defstruct [
@@ -25,7 +26,8 @@ defmodule LlmComposer.LlmResponse do
     :output_tokens,
     :previous_response,
     :raw,
-    :status
+    :status,
+    :stream
   ]
 
   @type model_response :: Tesla.Env.result()
@@ -41,6 +43,24 @@ defmodule LlmComposer.LlmResponse do
     {:error, resp}
   end
 
+  # Stream response case
+  def new(
+        {status, %{response: stream}} = raw_response,
+        llm_model
+      )
+      when llm_model in [:open_ai, :open_router] and is_function(stream) do
+    {:ok,
+     %__MODULE__{
+       actions: [],
+       input_tokens: nil,
+       output_tokens: nil,
+       stream: stream,
+       main_response: nil,
+       raw: raw_response,
+       status: status
+     }}
+  end
+
   def new(
         {status,
          %{actions: actions, response: %{"choices" => [first_choice | _]} = raw_response}},
@@ -65,6 +85,24 @@ defmodule LlmComposer.LlmResponse do
     }}
   end
 
+  # Stream response case for Ollama
+  def new(
+        {status, %{response: stream}} = raw_response,
+        :ollama
+      )
+      when is_function(stream) do
+    {:ok,
+     %__MODULE__{
+       actions: [],
+       input_tokens: nil,
+       output_tokens: nil,
+       stream: stream,
+       main_response: nil,
+       raw: raw_response,
+       status: status
+     }}
+  end
+
   def new(
         {status, %{actions: actions, response: %{"message" => message} = raw_response}},
         :ollama
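With the two stream clauses above, a caller can tell a streamed response apart from a fully materialized one by which field is set; a small sketch using the struct fields defined in this file (the `content` field of `LlmComposer.Message` is taken from the README examples):

```elixir
# Sketch: branch on whether the response carries a lazy stream or a complete message.
case res do
  %LlmComposer.LlmResponse{stream: stream} when is_function(stream) ->
    stream
    |> LlmComposer.parse_stream_response()
    |> Enum.each(&IO.inspect/1)

  %LlmComposer.LlmResponse{main_response: message} ->
    IO.puts(message.content)
end
```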

lib/llm_composer/providers/ollama.ex

Lines changed: 5 additions & 19 deletions
@@ -6,28 +6,12 @@ defmodule LlmComposer.Providers.Ollama do
   """
   @behaviour LlmComposer.Provider
 
-  use Tesla
-
+  alias LlmComposer.HttpClient
   alias LlmComposer.LlmResponse
   alias LlmComposer.Providers.Utils
 
   @uri Application.compile_env(:llm_composer, :ollama_uri, "http://localhost:11434")
 
-  plug(Tesla.Middleware.BaseUrl, @uri)
-
-  plug(Tesla.Middleware.JSON)
-
-  plug(Tesla.Middleware.Retry,
-    delay: :timer.seconds(1),
-    max_delay: :timer.seconds(10),
-    max_retries: 5,
-    should_retry: fn
-      {:ok, %{status: status}} when status in [429, 500, 503] -> true
-      {:error, :closed} -> true
-      _other -> false
-    end
-  )
-
   @impl LlmComposer.Provider
   def name, do: :ollama
 
@@ -37,11 +21,13 @@ defmodule LlmComposer.Providers.Ollama do
   """
   def run(messages, system_message, opts) do
     model = Keyword.get(opts, :model)
+    client = HttpClient.client(@uri, opts)
+    req_opts = Utils.get_req_opts(opts)
 
     if model do
      messages
       |> build_request(system_message, model, opts)
-      |> then(&post("/api/chat", &1))
+      |> then(&Tesla.post(client, "/api/chat", &1, opts: req_opts))
       |> handle_response()
       |> LlmResponse.new(name())
     else
@@ -52,7 +38,7 @@ defmodule LlmComposer.Providers.Ollama do
   defp build_request(messages, system_message, model, opts) do
     base_request = %{
       model: model,
-      stream: false,
+      stream: Keyword.get(opts, :stream_response, false),
       # tools: get_tools(Keyword.get(opts, :functions)),
       messages: Utils.map_messages([system_message | messages])
     }

0 commit comments