@@ -32,7 +32,7 @@ export default class AiLlmsListEditor extends Component {
key = `${llm.provider}-${llm.name}`;
} else {
// case of preset
key = llm.id.replace(/\./g, "-");
key = llm.id.replace(/[.:\/]/g, "-");
}

key = `discourse_ai.llms.model_description.${key}`;
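For preset ids that contain dots, colons, or slashes (OpenRouter-style ids such as "deepseek/deepseek-r1-0528:free"), the broadened regex keeps the derived translation key flat. A minimal Ruby sketch of the same transform, using an illustrative preset id (the exact id format is an assumption, not taken from the diff):

# Illustrative only: mirrors the JS key sanitization above.
preset_id = "open_router-deepseek/deepseek-r1-0528:free"
key = preset_id.gsub(%r{[.:/]}, "-")
puts "discourse_ai.llms.model_description.#{key}"
# => discourse_ai.llms.model_description.open_router-deepseek-deepseek-r1-0528-free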
19 changes: 12 additions & 7 deletions config/locales/client.en.yml
@@ -539,21 +539,26 @@ en:

model_description:
none: "General settings that work for most language models"
anthropic-claude-3-7-sonnet: "Anthropic's most intelligent model"
anthropic-claude-3-5-haiku: "Fast and cost-effective"
anthropic-claude-3-opus: "Excels at writing and complex tasks"
google-gemini-2-5-pro: "Mid-sized multimodal model capable of a wide range of tasks"
google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
anthropic-claude-opus-4-0: "Anthropic's most intelligent model"
anthropic-claude-sonnet-4-0: "Optimal balance of speed and cost"
anthropic-claude-3-7-sonnet-latest: "Optimal balance of speed and cost (previous generation)"
anthropic-claude-3-5-haiku-latest: "Fast and cost-effective"
google-gemini-2-5-pro: "Large multimodal model capable of a wide range of tasks"
google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning (previous generation)"
google-gemini-2-5-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
google-gemini-2-0-flash-lite: "Cost efficient and low latency model"
open_ai-o1: "Open AI's most capable reasoning model"
open_ai-o3-mini: "Advanced Cost-efficient reasoning model"
open_ai-o3: "OpenAI's most capable reasoning model"
open_ai-o4-mini: "Advanced, cost-efficient reasoning model"
open_ai-gpt-4-1: "OpenAI's flagship model. It is well suited for problem solving across domains"
open_ai-gpt-4-1-mini: "Provides a balance between intelligence, speed, and cost that makes it an attractive model for many use cases."
open_ai-gpt-4-1-nano: "Fastest, most cost-effective GPT-4.1 model."
samba_nova-Meta-Llama-3-1-8B-Instruct: "Efficient lightweight multilingual model"
samba_nova-Meta-Llama-3-3-70B-Instruct": "Powerful multipurpose model"
mistral-mistral-large-latest: "Mistral's most powerful model"
mistral-pixtral-large-latest: "Mistral's most powerful vision capable model"
open_router-x-ai-grok-3-beta: "xAI's latest model"
open_router-deepseek-deepseek-r1-0528-free: "DeepSeek's latest reasoning model"
open_router-meta-llama-3-3-70b-instruct: "Highly capable multilingual model"

preseeded_model_description: "Pre-configured open-source model utilizing %{model}"

7 changes: 6 additions & 1 deletion lib/completions/endpoints/anthropic.rb
@@ -31,6 +31,12 @@ def default_options(dialect)
"claude-3-opus-20240229"
when "claude-3-5-sonnet"
"claude-3-5-sonnet-latest"
when "claude-3-7-sonnet"
"claude-3-7-sonnet-latest"
when "claude-4-opus"
"claude-4-opus-20250514"
when "claude-4-sonnet"
"claude-4-sonnet-20250514"
else
llm_model.name
end
@@ -92,7 +98,6 @@ def prepare_payload(prompt, model_params, dialect)
default_options(dialect).merge(model_params.except(:response_format)).merge(
messages: prompt.messages,
)

payload[:system] = prompt.system_prompt if prompt.system_prompt.present?
payload[:stream] = true if @streaming_mode

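The new branches extend the alias table that maps short preset names onto dated Anthropic model ids. A standalone sketch of that mapping, with the case expression condensed into a hash lookup (illustrative, not the plugin's actual method):

# Simplified stand-in for the case expression in default_options;
# unknown names fall through unchanged, like the else branch.
def resolve_anthropic_model(name)
  {
    "claude-3-7-sonnet" => "claude-3-7-sonnet-latest",
    "claude-4-opus" => "claude-4-opus-20250514",
    "claude-4-sonnet" => "claude-4-sonnet-20250514",
  }.fetch(name, name)
end

resolve_anthropic_model("claude-4-opus")   # => "claude-4-opus-20250514"
resolve_anthropic_model("claude-opus-4-0") # => "claude-opus-4-0" (already concrete)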
1 change: 1 addition & 0 deletions lib/completions/endpoints/aws_bedrock.rb
@@ -120,6 +120,7 @@ def prepare_payload(prompt, model_params, dialect)
default_options(dialect).merge(model_params.except(:response_format)).merge(
messages: prompt.messages,
)

payload[:system] = prompt.system_prompt if prompt.system_prompt.present?

prefilled_message = +""
10 changes: 10 additions & 0 deletions lib/completions/endpoints/base.rb
@@ -48,6 +48,14 @@ def initialize(llm_model)
@llm_model = llm_model
end

def enforce_max_output_tokens(value)
if @llm_model.max_output_tokens.to_i > 0
value = @llm_model.max_output_tokens if (value.to_i > @llm_model.max_output_tokens) ||
(value.to_i <= 0)
end
value
end

def use_ssl?
if model_uri&.scheme.present?
model_uri.scheme == "https"
@@ -83,6 +91,8 @@ def perform_completion!(
@partial_tool_calls = partial_tool_calls
@output_thinking = output_thinking

max_tokens = enforce_max_output_tokens(model_params[:max_tokens])
model_params[:max_tokens] = max_tokens if max_tokens
model_params = normalize_model_params(model_params)
orig_blk = blk

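The new helper clamps whatever max_tokens the caller supplies to the model's configured ceiling, and applies the ceiling when the caller supplies nothing at all. A minimal sketch of the rule, with a plain Struct standing in for the persisted LlmModel:

# Illustrative only: Struct stands in for the LlmModel record.
LlmModelStub = Struct.new(:max_output_tokens)

def enforce_max_output_tokens(value, llm_model)
  if llm_model.max_output_tokens.to_i > 0
    value = llm_model.max_output_tokens if value.to_i > llm_model.max_output_tokens || value.to_i <= 0
  end
  value
end

model = LlmModelStub.new(2000)
enforce_max_output_tokens(2500, model) # => 2000 (request above the ceiling, clamped)
enforce_max_output_tokens(100, model)  # => 100  (within the limit, kept)
enforce_max_output_tokens(nil, model)  # => 2000 (no request value, ceiling applied)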
1 change: 1 addition & 0 deletions lib/completions/endpoints/gemini.rb
@@ -63,6 +63,7 @@ def prepare_payload(prompt, model_params, dialect)
tools = dialect.tools if @native_tool_support

payload = default_options.merge(contents: prompt[:messages])

payload[:systemInstruction] = {
role: "system",
parts: [{ text: prompt[:system_instruction].to_s }],
68 changes: 49 additions & 19 deletions lib/completions/llm.rb
@@ -27,25 +27,33 @@ def presets
id: "anthropic",
models: [
{
name: "claude-3-7-sonnet",
name: "claude-3-7-sonnet-latest",
tokens: 200_000,
display_name: "Claude 3.7 Sonnet",
input_cost: 3,
cached_input_cost: 0.30,
output_cost: 15,
},
{
name: "claude-3-5-haiku",
name: "claude-sonnet-4-0",
tokens: 200_000,
display_name: "Claude 4 Sonnet",
input_cost: 3,
cached_input_cost: 0.30,
output_cost: 15,
},
{
name: "claude-3-5-haiku-latest",
tokens: 200_000,
display_name: "Claude 3.5 Haiku",
input_cost: 0.80,
cached_input_cost: 0.08,
output_cost: 4,
},
{
name: "claude-3-opus",
name: "claude-opus-4-0",
tokens: 200_000,
display_name: "Claude 3 Opus",
display_name: "Claude 4 Opus",
input_cost: 15,
cached_input_cost: 1.50,
output_cost: 75,
@@ -62,15 +70,28 @@ def presets
name: "gemini-2.5-pro",
tokens: 800_000,
endpoint:
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-03-25",
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro",
display_name: "Gemini 2.5 Pro",
input_cost: 1.25,
output_cost: 10.0,
},
{
name: "gemini-2.5-flash",
tokens: 800_000,
endpoint:
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash",
display_name: "Gemini 2.5 Pro",
input_cost: 0.30,
output_cost: 2.50,
},
{
name: "gemini-2.0-flash",
tokens: 800_000,
endpoint:
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash",
display_name: "Gemini 2.0 Flash",
input_cost: 0.10,
output_cost: 0.40,
},
{
name: "gemini-2.0-flash-lite",
@@ -89,20 +110,20 @@
id: "open_ai",
models: [
{
name: "o3-mini",
name: "o4-mini",
tokens: 200_000,
display_name: "o3 Mini",
display_name: "o4 Mini",
input_cost: 1.10,
cached_input_cost: 0.55,
cached_input_cost: 0.275,
output_cost: 4.40,
},
{
name: "o1",
name: "o3",
tokens: 200_000,
display_name: "o1",
input_cost: 15,
cached_input_cost: 7.50,
output_cost: 60,
display_name: "o3",
input_cost: 2,
cached_input_cost: 0.5,
output_cost: 8,
},
{
name: "gpt-4.1",
@@ -177,14 +198,23 @@ def presets
id: "open_router",
models: [
{
name: "meta-llama/llama-3.3-70b-instruct",
tokens: 128_000,
display_name: "Llama 3.3 70B",
name: "x-ai/grok-3-beta",
tokens: 131_072,
display_name: "xAI Grok 3 Beta",
input_cost: 3,
output_cost: 15,
},
{
name: "deepseek/deepseek-r1-0528:free",
tokens: 163_000,
display_name: "DeepSeek R1 0528 - free",
},
{
name: "google/gemini-flash-1.5-exp",
tokens: 1_000_000,
display_name: "Gemini Flash 1.5 Exp",
name: "meta-llama/llama-3.3-70b-instruct",
tokens: 131_072,
display_name: "Llama 3.3 70B Instruct",
input_cost: 0.05,
output_cost: 0.25,
},
],
tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
45 changes: 45 additions & 0 deletions spec/lib/completions/endpoints/anthropic_spec.rb
@@ -665,6 +665,51 @@
expect(log.response_tokens).to eq(30)
end

describe "max output tokens" do
it "it respects max output tokens supplied to model unconditionally, even with thinking" do
model.update!(
provider_params: {
enable_reasoning: true,
reasoning_tokens: 1000,
},
max_output_tokens: 2000,
)

parsed_body = nil
stub_request(:post, url).with(
body:
proc do |req_body|
parsed_body = JSON.parse(req_body, symbolize_names: true)
true
end,
headers: {
"Content-Type" => "application/json",
"X-Api-Key" => "123",
"Anthropic-Version" => "2023-06-01",
},
).to_return(
status: 200,
body: {
id: "msg_123",
type: "message",
role: "assistant",
content: [{ type: "text", text: "test response" }],
model: "claude-3-opus-20240229",
usage: {
input_tokens: 10,
output_tokens: 5,
},
}.to_json,
)

llm.generate(prompt, user: Discourse.system_user, max_tokens: 2500)
expect(parsed_body[:max_tokens]).to eq(2000)

llm.generate(prompt, user: Discourse.system_user)
expect(parsed_body[:max_tokens]).to eq(2000)
end
end

describe "parameter disabling" do
it "excludes disabled parameters from the request" do
model.update!(provider_params: { disable_top_p: true, disable_temperature: true })
36 changes: 35 additions & 1 deletion spec/lib/completions/endpoints/gemini_spec.rb
@@ -179,6 +179,40 @@ def tool_response
expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 10_000 })
end

it "correctly handles max output tokens" do
model.update!(max_output_tokens: 1000)

response = gemini_mock.response("some response mode").to_json

req_body = nil

llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
url = "#{model.url}:generateContent?key=123"

stub_request(:post, url).with(
body:
proc do |_req_body|
req_body = _req_body
true
end,
).to_return(status: 200, body: response)

response = llm.generate("Hello", user: user, max_tokens: 10_000)
parsed = JSON.parse(req_body, symbolize_names: true)

expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)

response = llm.generate("Hello", user: user, max_tokens: 50)
parsed = JSON.parse(req_body, symbolize_names: true)

expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(50)

response = llm.generate("Hello", user: user)
parsed = JSON.parse(req_body, symbolize_names: true)

expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)
end

it "clamps thinking tokens within allowed limits" do
model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "30000" })

@@ -551,7 +585,7 @@ def tool_response
data: {"candidates": [{"content": {"parts": [{"text": "\\","}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}

data: {"candidates": [{"content": {"parts": [{"text": "\\""}],"role": "model"}}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}

data: {"candidates": [{"content": {"parts": [{"text": "num"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"}

data: {"candidates": [{"content": {"parts": [{"text": "\\":"}],"role": "model"},"safetyRatings": [{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"}
10 changes: 8 additions & 2 deletions spec/lib/completions/endpoints/open_ai_spec.rb
@@ -173,7 +173,7 @@ def request_body(prompt, stream: false, tool_call: false)

describe "max tokens for reasoning models" do
it "uses max_completion_tokens for reasoning models" do
model.update!(name: "o3-mini")
model.update!(name: "o3-mini", max_output_tokens: 999)
llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
prompt =
DiscourseAi::Completions::Prompt.new(
@@ -201,7 +201,13 @@ def request_body(prompt, stream: false, tool_call: false)
llm.generate(prompt, user: user, max_tokens: 1000) { |chunk| result << chunk }

expect(result).to eq("hello")
expect(body_parsed["max_completion_tokens"]).to eq(1000)
expect(body_parsed["max_completion_tokens"]).to eq(999)

llm.generate(prompt, user: user, max_tokens: 100) { |chunk| result << chunk }
expect(body_parsed["max_completion_tokens"]).to eq(100)

llm.generate(prompt, user: user) { |chunk| result << chunk }
expect(body_parsed["max_completion_tokens"]).to eq(999)
end
end

6 changes: 3 additions & 3 deletions spec/system/llms/ai_llm_spec.rb
@@ -14,7 +14,7 @@
it "correctly sets defaults" do
visit "/admin/plugins/discourse-ai/ai-llms"

find("[data-llm-id='anthropic-claude-3-5-haiku'] button").click()
find("[data-llm-id='anthropic-claude-opus-4-0'] button").click()
form.field("api_key").fill_in("abcd")
form.field("enabled_chat_bot").toggle
form.submit
@@ -26,9 +26,9 @@
expect(llm.api_key).to eq("abcd")

preset = DiscourseAi::Completions::Llm.presets.find { |p| p[:id] == "anthropic" }
model_preset = preset[:models].find { |m| m[:name] == "claude-3-5-haiku" }
model_preset = preset[:models].find { |m| m[:name] == "claude-opus-4-0" }

expect(llm.name).to eq("claude-3-5-haiku")
expect(llm.name).to eq("claude-opus-4-0")
expect(llm.url).to eq(preset[:endpoint])
expect(llm.tokenizer).to eq(preset[:tokenizer].to_s)
expect(llm.max_prompt_tokens.to_i).to eq(model_preset[:tokens])