
Commit 37dbd48

FIX: implement max_output tokens (anthropic/openai/bedrock/gemini/open router) (#1447)
* FIX: implement max_output tokens (anthropic/openai/bedrock/gemini/open router)

  Previously this feature existed but was not implemented. This also updates a number of models in our presets to point to the latest versions.

* Implementing in base is safer, simpler, and easier to manage.

* Anthropic 3.5 is getting older; let's use 4.0 here and fix the spec.
1 parent 3e87e92 commit 37dbd48

11 files changed, +171 -34 lines changed


assets/javascripts/discourse/components/ai-llms-list-editor.gjs

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ export default class AiLlmsListEditor extends Component {
       key = `${llm.provider}-${llm.name}`;
     } else {
       // case of preset
-      key = llm.id.replace(/\./g, "-");
+      key = llm.id.replace(/[.:\/]/g, "-");
     }

     key = `discourse_ai.llms.model_description.${key}`;
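
The wider character class matters because several of the new preset ids below contain ":" or "/" (for example deepseek/deepseek-r1-0528:free), which the old dot-only regex left intact in the i18n key. A minimal Ruby sketch of the same normalization (the component itself is JavaScript, and these ids are illustrative):

    # Mirrors the .gjs regex: dots, colons, and slashes all become dashes,
    # producing a usable locale-key segment.
    ids = ["claude-opus-4-0", "deepseek/deepseek-r1-0528:free", "x-ai/grok-3-beta"]
    ids.each { |id| puts "discourse_ai.llms.model_description.#{id.gsub(%r{[.:/]}, '-')}" }
    # => ...model_description.deepseek-deepseek-r1-0528-free
    # => ...model_description.x-ai-grok-3-beta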

config/locales/client.en.yml

Lines changed: 12 additions & 7 deletions

@@ -539,21 +539,26 @@ en:

       model_description:
         none: "General settings that work for most language models"
-        anthropic-claude-3-7-sonnet: "Anthropic's most intelligent model"
-        anthropic-claude-3-5-haiku: "Fast and cost-effective"
-        anthropic-claude-3-opus: "Excels at writing and complex tasks"
-        google-gemini-2-5-pro: "Mid-sized multimodal model capable of a wide range of tasks"
-        google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
+        anthropic-claude-opus-4-0: "Anthropic's most intelligent model"
+        anthropic-claude-sonnet-4-0: "Optimal balance of speed and cost"
+        anthropic-claude-3-7-sonnet-latest: "Optimal balance of speed and cost (previous generation)"
+        anthropic-claude-3-5-haiku-latest: "Fast and cost-effective"
+        google-gemini-2-5-pro: "Large multimodal model capable of a wide range of tasks"
+        google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning (previous generation)"
+        google-gemini-2-5-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
         google-gemini-2-0-flash-lite: "Cost efficient and low latency model"
-        open_ai-o1: "Open AI's most capable reasoning model"
-        open_ai-o3-mini: "Advanced Cost-efficient reasoning model"
+        open_ai-o3: "Open AI's most capable reasoning model"
+        open_ai-o4-mini: "Advanced Cost-efficient reasoning model"
         open_ai-gpt-4-1: "Open AI's flagship model. It is well suited for problem solving across domains"
         open_ai-gpt-4-1-mini: "Provides a balance between intelligence, speed, and cost that makes it an attractive model for many use cases."
         open_ai-gpt-4-1-nano: "Fastest, most cost-effective GPT-4.1 model."
         samba_nova-Meta-Llama-3-1-8B-Instruct: "Efficient lightweight multilingual model"
         samba_nova-Meta-Llama-3-3-70B-Instruct: "Powerful multipurpose model"
         mistral-mistral-large-latest: "Mistral's most powerful model"
         mistral-pixtral-large-latest: "Mistral's most powerful vision capable model"
+        open_router-x-ai-grok-3-beta: "xAI's latest model"
+        open_router-deepseek-deepseek-r1-0528-free: "DeepSeek's latest reasoning model"
+        open_router-meta-llama-3-3-70b-instruct: "Highly capable multilingual model"

       preseeded_model_description: "Pre-configured open-source model utilizing %{model}"

lib/completions/endpoints/anthropic.rb

Lines changed: 6 additions & 1 deletion

@@ -31,6 +31,12 @@ def default_options(dialect)
         "claude-3-opus-20240229"
       when "claude-3-5-sonnet"
         "claude-3-5-sonnet-latest"
+      when "claude-3-7-sonnet"
+        "claude-3-7-sonnet-latest"
+      when "claude-4-opus"
+        "claude-4-opus-20250514"
+      when "claude-4-sonnet"
+        "claude-4-sonnet-20250514"
       else
         llm_model.name
       end

@@ -92,7 +98,6 @@ def prepare_payload(prompt, model_params, dialect)
       default_options(dialect).merge(model_params.except(:response_format)).merge(
         messages: prompt.messages,
       )
-
      payload[:system] = prompt.system_prompt if prompt.system_prompt.present?
      payload[:stream] = true if @streaming_mode
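
These `when` branches extend the existing alias table in `default_options`, so a model stored under a generic name is sent to Anthropic as either a `-latest` alias or a dated snapshot. A standalone sketch of the same mapping, assuming the same fallthrough to the stored name:

    # Hypothetical extraction of the alias lookup added above.
    def resolve_anthropic_model(name)
      case name
      when "claude-3-5-sonnet" then "claude-3-5-sonnet-latest"
      when "claude-3-7-sonnet" then "claude-3-7-sonnet-latest"
      when "claude-4-opus" then "claude-4-opus-20250514"
      when "claude-4-sonnet" then "claude-4-sonnet-20250514"
      else name # anything else passes through, like llm_model.name
      end
    end

    resolve_anthropic_model("claude-4-opus") # => "claude-4-opus-20250514"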

lib/completions/endpoints/aws_bedrock.rb

Lines changed: 1 addition & 0 deletions

@@ -120,6 +120,7 @@ def prepare_payload(prompt, model_params, dialect)
       default_options(dialect).merge(model_params.except(:response_format)).merge(
         messages: prompt.messages,
       )
+
      payload[:system] = prompt.system_prompt if prompt.system_prompt.present?

      prefilled_message = +""

lib/completions/endpoints/base.rb

Lines changed: 10 additions & 0 deletions

@@ -48,6 +48,14 @@ def initialize(llm_model)
       @llm_model = llm_model
     end

+    def enforce_max_output_tokens(value)
+      if @llm_model.max_output_tokens.to_i > 0
+        value = @llm_model.max_output_tokens if (value.to_i > @llm_model.max_output_tokens) ||
+          (value.to_i <= 0)
+      end
+      value
+    end
+
     def use_ssl?
       if model_uri&.scheme.present?
         model_uri.scheme == "https"

@@ -83,6 +91,8 @@ def perform_completion!(
       @partial_tool_calls = partial_tool_calls
       @output_thinking = output_thinking

+      max_tokens = enforce_max_output_tokens(model_params[:max_tokens])
+      model_params[:max_tokens] = max_tokens if max_tokens
       model_params = normalize_model_params(model_params)
       orig_blk = blk
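
The clamp in `enforce_max_output_tokens` has two effects: a caller-supplied `max_tokens` can only lower the configured ceiling, never raise it, and a missing or non-positive value falls back to the ceiling. A minimal sketch of the same logic with worked examples (standalone, with the model stubbed out for illustration):

    # Standalone restatement of the clamp; LlmModel is a stub, not the real class.
    LlmModel = Struct.new(:max_output_tokens)

    def enforce_max_output_tokens(value, llm_model)
      if llm_model.max_output_tokens.to_i > 0
        value = llm_model.max_output_tokens if value.to_i > llm_model.max_output_tokens ||
          value.to_i <= 0
      end
      value
    end

    model = LlmModel.new(2000)
    enforce_max_output_tokens(2500, model) # => 2000 (capped at the configured ceiling)
    enforce_max_output_tokens(100, model)  # => 100  (lower requests pass through)
    enforce_max_output_tokens(nil, model)  # => 2000 (no request means use the ceiling)

Because this runs in `perform_completion!` on the base class, every endpoint (Anthropic, OpenAI, Bedrock, Gemini, OpenRouter) gets the behavior before provider-specific parameter normalization, which is the "implementing in base is safer" point from the commit message.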

lib/completions/endpoints/gemini.rb

Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ def prepare_payload(prompt, model_params, dialect)
       tools = dialect.tools if @native_tool_support

       payload = default_options.merge(contents: prompt[:messages])
+
       payload[:systemInstruction] = {
         role: "system",
         parts: [{ text: prompt[:system_instruction].to_s }],

lib/completions/llm.rb

Lines changed: 49 additions & 19 deletions

@@ -27,25 +27,33 @@ def presets
           id: "anthropic",
           models: [
             {
-              name: "claude-3-7-sonnet",
+              name: "claude-3-7-sonnet-latest",
               tokens: 200_000,
               display_name: "Claude 3.7 Sonnet",
               input_cost: 3,
               cached_input_cost: 0.30,
               output_cost: 15,
             },
             {
-              name: "claude-3-5-haiku",
+              name: "claude-sonnet-4-0",
+              tokens: 200_000,
+              display_name: "Claude 4 Sonnet",
+              input_cost: 3,
+              cached_input_cost: 0.30,
+              output_cost: 15,
+            },
+            {
+              name: "claude-3-5-haiku-latest",
               tokens: 200_000,
               display_name: "Claude 3.5 Haiku",
               input_cost: 0.80,
               cached_input_cost: 0.08,
               output_cost: 4,
             },
             {
-              name: "claude-3-opus",
+              name: "claude-opus-4-0",
               tokens: 200_000,
-              display_name: "Claude 3 Opus",
+              display_name: "Claude 4 Opus",
               input_cost: 15,
               cached_input_cost: 1.50,
               output_cost: 75,

@@ -62,15 +70,28 @@ def presets
               name: "gemini-2.5-pro",
               tokens: 800_000,
               endpoint:
-                "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-03-25",
+                "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro",
+              display_name: "Gemini 2.5 Pro",
+              input_cost: 1.25,
+              oputput_cost: 10.0,
+            },
+            {
+              name: "gemini-2.5-flash",
+              tokens: 800_000,
+              endpoint:
+                "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash",
               display_name: "Gemini 2.5 Pro",
+              input_cost: 0.30,
+              output_cost: 2.50,
             },
             {
               name: "gemini-2.0-flash",
               tokens: 800_000,
               endpoint:
                 "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash",
               display_name: "Gemini 2.0 Flash",
+              input_cost: 0.10,
+              output_cost: 0.40,
             },
             {
               name: "gemini-2.0-flash-lite",

@@ -89,20 +110,20 @@ def presets
           id: "open_ai",
           models: [
             {
-              name: "o3-mini",
+              name: "o4-mini",
               tokens: 200_000,
-              display_name: "o3 Mini",
+              display_name: "o4 Mini",
               input_cost: 1.10,
-              cached_input_cost: 0.55,
+              cached_input_cost: 0.275,
               output_cost: 4.40,
             },
             {
-              name: "o1",
+              name: "o3",
               tokens: 200_000,
-              display_name: "o1",
-              input_cost: 15,
-              cached_input_cost: 7.50,
-              output_cost: 60,
+              display_name: "o3",
+              input_cost: 2,
+              cached_input_cost: 0.5,
+              output_cost: 8,
             },
             {
               name: "gpt-4.1",

@@ -177,14 +198,23 @@ def presets
           id: "open_router",
           models: [
             {
-              name: "meta-llama/llama-3.3-70b-instruct",
-              tokens: 128_000,
-              display_name: "Llama 3.3 70B",
+              name: "x-ai/grok-3-beta",
+              tokens: 131_072,
+              display_name: "xAI Grok 3 Beta",
+              input_cost: 3,
+              output_cost: 15,
+            },
+            {
+              name: "deepseek/deepseek-r1-0528:free",
+              tokens: 163_000,
+              display_name: "DeepSeek R1 0528 - free",
             },
             {
-              name: "google/gemini-flash-1.5-exp",
-              tokens: 1_000_000,
-              display_name: "Gemini Flash 1.5 Exp",
+              name: "meta-llama/llama-3.3-70b-instruct",
+              tokens: 131_072,
+              display_name: "Llama 3.3 70B Instruct",
+              input_cost: 0.05,
+              output_cost: 0.25,
             },
           ],
           tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
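
Most preset entries now carry `input_cost` and `output_cost`, presumably USD per one million tokens by the usual provider convention (the unit is an assumption here; the diff does not state it). A minimal sketch of how such an entry could be used to estimate the cost of a single completion:

    # Illustrative only: estimate_cost is a hypothetical helper, and the
    # per-million-token unit for the preset costs is assumed.
    def estimate_cost(preset_model, input_tokens, output_tokens)
      input = input_tokens * preset_model[:input_cost].to_f / 1_000_000
      output = output_tokens * preset_model[:output_cost].to_f / 1_000_000
      input + output
    end

    o3 = { name: "o3", input_cost: 2, output_cost: 8 }
    estimate_cost(o3, 10_000, 2_000) # => 0.036 (USD)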

spec/lib/completions/endpoints/anthropic_spec.rb

Lines changed: 45 additions & 0 deletions

@@ -665,6 +665,51 @@
     expect(log.response_tokens).to eq(30)
   end

+  describe "max output tokens" do
+    it "it respects max output tokens supplied to model unconditionally, even with thinking" do
+      model.update!(
+        provider_params: {
+          enable_reasoning: true,
+          reasoning_tokens: 1000,
+        },
+        max_output_tokens: 2000,
+      )
+
+      parsed_body = nil
+      stub_request(:post, url).with(
+        body:
+          proc do |req_body|
+            parsed_body = JSON.parse(req_body, symbolize_names: true)
+            true
+          end,
+        headers: {
+          "Content-Type" => "application/json",
+          "X-Api-Key" => "123",
+          "Anthropic-Version" => "2023-06-01",
+        },
+      ).to_return(
+        status: 200,
+        body: {
+          id: "msg_123",
+          type: "message",
+          role: "assistant",
+          content: [{ type: "text", text: "test response" }],
+          model: "claude-3-opus-20240229",
+          usage: {
+            input_tokens: 10,
+            output_tokens: 5,
+          },
+        }.to_json,
+      )
+
+      llm.generate(prompt, user: Discourse.system_user, max_tokens: 2500)
+      expect(parsed_body[:max_tokens]).to eq(2000)
+
+      llm.generate(prompt, user: Discourse.system_user)
+      expect(parsed_body[:max_tokens]).to eq(2000)
+    end
+  end
+
   describe "parameter disabling" do
     it "excludes disabled parameters from the request" do
       model.update!(provider_params: { disable_top_p: true, disable_temperature: true })

spec/lib/completions/endpoints/gemini_spec.rb

Lines changed: 35 additions & 1 deletion

@@ -179,6 +179,40 @@ def tool_response
     expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 10_000 })
   end

+  it "correctly handles max output tokens" do
+    model.update!(max_output_tokens: 1000)
+
+    response = gemini_mock.response("some response mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user, max_tokens: 10_000)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)
+
+    response = llm.generate("Hello", user: user, max_tokens: 50)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(50)
+
+    response = llm.generate("Hello", user: user)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)
+  end
+
   it "clamps thinking tokens within allowed limits" do
     model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "30000" })

@@ -551,7 +585,7 @@ def tool_response
     data: {"candidates": [{"content": {"parts": [{"text": "\\","}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}

     data: {"candidates": [{"content": {"parts": [{"text": "\\""}],"role": "model"}}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
-
+
     data: {"candidates": [{"content": {"parts": [{"text": "num"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}

     data: {"candidates": [{"content": {"parts": [{"text": "\\":"}],"role": "model"},"safetyRatings": [{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}

spec/lib/completions/endpoints/open_ai_spec.rb

Lines changed: 8 additions & 2 deletions

@@ -173,7 +173,7 @@ def request_body(prompt, stream: false, tool_call: false)

   describe "max tokens for reasoning models" do
     it "uses max_completion_tokens for reasoning models" do
-      model.update!(name: "o3-mini")
+      model.update!(name: "o3-mini", max_output_tokens: 999)
       llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
       prompt =
         DiscourseAi::Completions::Prompt.new(

@@ -201,7 +201,13 @@ def request_body(prompt, stream: false, tool_call: false)
       llm.generate(prompt, user: user, max_tokens: 1000) { |chunk| result << chunk }

       expect(result).to eq("hello")
-      expect(body_parsed["max_completion_tokens"]).to eq(1000)
+      expect(body_parsed["max_completion_tokens"]).to eq(999)
+
+      llm.generate(prompt, user: user, max_tokens: 100) { |chunk| result << chunk }
+      expect(body_parsed["max_completion_tokens"]).to eq(100)
+
+      llm.generate(prompt, user: user) { |chunk| result << chunk }
+      expect(body_parsed["max_completion_tokens"]).to eq(999)
     end
   end
