Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 1f877fb

Browse files
committed
allow evals to have follow-ups
1 parent 5bd9710 commit 1f877fb

File tree

3 files changed

+138
-0
lines changed

3 files changed

+138
-0
lines changed

config/eval-llms.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,21 @@ llms:
4949
max_prompt_tokens: 200000
5050
vision_enabled: true
5151

52+
claude-3.7-sonnet-thinking:
  # Distinct display name so eval reports can tell the thinking variant
  # apart from the plain claude-3.7-sonnet entry above.
  display_name: Claude 3.7 Sonnet Thinking
  name: claude-3-7-sonnet-latest
  tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
  api_key_env: ANTHROPIC_API_KEY
  provider: anthropic
  url: https://api.anthropic.com/v1/messages
  max_prompt_tokens: 200000
  vision_enabled: true
  provider_params:
    # Anthropic extended thinking requires default sampling params:
    # top_p and temperature must not be sent alongside reasoning.
    disable_top_p: true
    disable_temperature: true
    enable_reasoning: true
    reasoning_tokens: 1024
66+
5267
gemini-2.0-flash:
5368
display_name: Gemini 2.0 Flash
5469
name: gemini-2-0-flash

evals/lib/eval.rb

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,86 @@ def pdf_to_text(llm, path:)
241241
upload.destroy if upload
242242
end
243243

244+
# Runs a single prompt against the given eval +llm+ and returns the raw
# generation result, optionally continuing the conversation with one
# follow-up turn.
#
# llm             - eval wrapper exposing #llm_model (project type)
# system_prompt:  - system prompt string for the new Prompt
# message:        - user message content for the first turn
# temperature:    - optional sampling temperature passed to #generate
# tools:          - optional array of tool hashes; keys are symbolized
#                   in place before being attached to the prompt
# stream:         - when true, collects streamed partials into an Array
# output_thinking:- forwarded to #generate (include thinking output)
# followup:       - optional hash with :message (and optionally :tools)
#                   describing a second turn to push after the first
#                   response is appended to the prompt
#
# Returns whatever #generate produced: an Array of partials when
# streaming, otherwise the single generation result (or [] if the
# follow-up generation raised).
def prompt_call(
  llm,
  system_prompt:,
  message:,
  temperature: nil,
  tools: nil,
  stream: false,
  output_thinking: false,
  followup: nil
)
  if tools
    tools.each do |tool|
      # normalize string keys from YAML/JSON fixtures to symbols,
      # including each entry of the nested :parameters array
      tool.symbolize_keys!
      tool[:parameters]&.each(&:symbolize_keys!)
    end
  end
  prompt =
    DiscourseAi::Completions::Prompt.new(
      system_prompt,
      messages: [{ type: :user, content: message }],
    )

  prompt.tools = tools if tools

  result = nil
  # Lambda (not a method) so it closes over `prompt` and reassigns the
  # outer `result`; called once for the first turn and again for the
  # follow-up turn against the mutated prompt.
  generate_result =
    lambda do
      if stream
        result = []
        llm
          .llm_model
          .to_llm
          .generate(
            prompt,
            user: Discourse.system_user,
            temperature: temperature,
            output_thinking: output_thinking,
          ) { |partial| result << partial }
      else
        result =
          llm.llm_model.to_llm.generate(
            prompt,
            user: Discourse.system_user,
            temperature: temperature,
            output_thinking: output_thinking,
          )
      end
    end

  generate_result.call

  if followup
    # dup before symbolize_keys! so the caller's hash is not mutated
    followup = followup.dup.symbolize_keys!
    prompt.tools = followup[:tools] if followup[:tools]
    # append the model's first response so the follow-up turn has context
    prompt.push_model_response(result)
    message = followup[:message].dup
    message = message.symbolize_keys!
    message[:type] = message[:type].to_sym if message[:type]
    %i[id name].each do |key|
      if message[key].is_a?(Array)
        type, inner_key = message[key]
        # this allows us to dynamically set the id or name of the tool call
        prev = prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
        message[key] = prev[inner_key.to_sym] if prev
      end
    end

    prompt.push(**message)

    begin
      generate_result.call
    rescue => e
      # should not happen but it helps debugging...
      puts e
      result = []
    end
  end
  result
end
323+
244324
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
245325
css = File.read(css_path)
246326
js = File.read(js_path)

lib/completions/prompt.rb

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,49 @@ def initialize(
4141
@tool_choice = tool_choice
4242
end
4343

44+
# Mirrors a model response back into the prompt, creating symmetry
# between responses and prompts: anything an endpoint returned (plain
# text, tool calls, thinking blocks) can be appended and the
# conversation continued.
def push_model_response(response)
  response = [response] unless response.is_a?(Array)

  thinking = nil
  thinking_signature = nil
  redacted_thinking_signature = nil

  response.each do |item|
    case item
    when Thinking
      # partial thinking chunks are streaming artifacts; skip them
      next if item.partial?
      if item.redacted
        redacted_thinking_signature = item.signature
      else
        thinking = item.message
        thinking_signature = item.signature
      end
    when ToolCall
      next if item.partial?
      # surprising API shape: tool-call arguments must be wrapped and
      # serialized as JSON content
      push(
        type: :tool_call,
        content: { arguments: item.parameters }.to_json,
        id: item.id,
        name: item.name,
      )
    when String
      push(type: :model, content: item)
    else
      raise ArgumentError, "response must be an array of strings, ToolCalls, or Thinkings"
    end
  end

  # Anthropic's rules attach thinking metadata to the last message of
  # the response — odd; long term thinking may deserve its own object.
  if thinking || redacted_thinking_signature
    messages.last[:thinking] = thinking
    messages.last[:thinking_signature] = thinking_signature
    messages.last[:redacted_thinking_signature] = redacted_thinking_signature
  end
end
86+
4487
def push(
4588
type:,
4689
content:,

0 commit comments

Comments
 (0)