
Commit 28af443

Dev: eval improvements (#1162)
Adds Claude 3.7 Sonnet. Adds support for temperature in the eval framework.

Parent: f6eedf3

File tree

2 files changed: +19 -5 lines

config/eval-llms.yml

Lines changed: 10 additions & 0 deletions
@@ -39,6 +39,16 @@ llms:
     max_prompt_tokens: 200000
     vision_enabled: true
 
+  claude-3.7-sonnet:
+    display_name: Claude 3.7 Sonnet
+    name: claude-3-7-sonnet-latest
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    api_key_env: ANTHROPIC_API_KEY
+    provider: anthropic
+    url: https://api.anthropic.com/v1/messages
+    max_prompt_tokens: 200000
+    vision_enabled: true
+
   gemini-2.0-flash:
     display_name: Gemini 2.0 Flash
     name: gemini-2-0-flash
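
For context, a minimal sketch of how an eval run could resolve this new entry. The lookup below is illustrative, not part of this commit, and assumes the YAML is read with string keys:

require "yaml"

config = YAML.load_file("config/eval-llms.yml")
model = config["llms"]["claude-3.7-sonnet"]

api_key = ENV.fetch(model["api_key_env"]) # ANTHROPIC_API_KEY must be set
puts "#{model['display_name']} via #{model['provider']} at #{model['url']}"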

evals/lib/eval.rb

Lines changed: 9 additions & 5 deletions
@@ -121,7 +121,7 @@ def to_json
   def judge_result(result)
     prompt = judge[:prompt].dup
     prompt.sub!("{{output}}", result)
-    prompt.sub!("{{input}}", args[:input])
+    args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
 
     prompt += <<~SUFFIX
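
The judge prompt previously substituted only {{input}}; now every key in args is swapped into its matching {{key}} placeholder. A self-contained illustration of the new loop, with made-up values:

args = { input: "2 + 2", expected_output: "4" }
prompt = "Question: {{input}}\nExpected: {{expected_output}}"
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
prompt # => "Question: 2 + 2\nExpected: 4"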
@@ -145,7 +145,8 @@ def judge_result(result)
       messages: [{ type: :user, content: prompt }],
     )
 
-    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+    result =
+      judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user, temperature: 0)
 
     if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
       rating = rating[1].to_i
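
Pinning the judge call to temperature: 0 makes its ratings as repeatable as the provider allows. The reply is then parsed with the existing regex; a worked example with a made-up judge response:

result = "The answer matches. [RATING]9[/RATING]"
if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
  rating = rating[1].to_i # => 9
end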
@@ -219,7 +220,7 @@ def pdf_to_text(llm, path:)
     upload.destroy if upload
   end
 
-  def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
+  def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
     if tools
       tools.each do |tool|
         tool.symbolize_keys!
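
Since temperature defaults to nil, existing callers are unaffected; only evals that set it opt in. A hypothetical call site using the extended signature (the prompt text here is made up):

prompt_call(
  llm,
  system_prompt: "You are a terse assistant.",
  message: "Summarize this thread in one sentence.",
  temperature: 0.2,
)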
@@ -230,16 +231,19 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
       DiscourseAi::Completions::Prompt.new(
         system_prompt,
         messages: [{ type: :user, content: message }],
-        tools: tools,
       )
 
+    prompt.tools = tools if tools
+
     result = nil
     if stream
       result = []
       llm
         .llm_model
         .to_llm
-        .generate(prompt, user: Discourse.system_user) { |partial| result << partial }
+        .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
+          result << partial
+        end
     else
       result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
     end
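
Tools handling changes shape as well: the prompt is now built without a tools: argument and tools are attached afterwards, so Prompt.new never receives a nil tools value. Condensed, the new flow looks like this (names as in the diff; the prompt assignment is implied by the surrounding code):

prompt =
  DiscourseAi::Completions::Prompt.new(
    system_prompt,
    messages: [{ type: :user, content: message }],
  )
prompt.tools = tools if tools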
