Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 0e628a1

Browse files
committed
allow for evals to have follow ups
1 parent fc7e213 commit 0e628a1

File tree

3 files changed

+119
-11
lines changed

3 files changed

+119
-11
lines changed

config/eval-llms.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,21 @@ llms:
4949
max_prompt_tokens: 200000
5050
vision_enabled: true
5151

52+
claude-3.7-sonnet-thinking:
53+
display_name: Claude 3.7 Sonnet Thinking
54+
name: claude-3-7-sonnet-latest
55+
tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
56+
api_key_env: ANTHROPIC_API_KEY
57+
provider: anthropic
58+
url: https://api.anthropic.com/v1/messages
59+
max_prompt_tokens: 200000
60+
vision_enabled: true
61+
provider_params:
62+
disable_top_p: true
63+
disable_temperature: true
64+
enable_reasoning: true
65+
reasoning_tokens: 1024
66+
5267
gemini-2.0-flash:
5368
display_name: Gemini 2.0 Flash
5469
name: gemini-2-0-flash

evals/lib/eval.rb

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -220,11 +220,20 @@ def pdf_to_text(llm, path:)
220220
upload.destroy if upload
221221
end
222222

223-
def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
223+
def prompt_call(
224+
llm,
225+
system_prompt:,
226+
message:,
227+
temperature: nil,
228+
tools: nil,
229+
stream: false,
230+
output_thinking: false,
231+
followup: nil
232+
)
224233
if tools
225234
tools.each do |tool|
226235
tool.symbolize_keys!
227-
tool[:parameters].symbolize_keys! if tool[:parameters]
236+
tool[:parameters]&.each(&:symbolize_keys!)
228237
end
229238
end
230239
prompt =
@@ -236,16 +245,57 @@ def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, str
236245
prompt.tools = tools if tools
237246

238247
result = nil
239-
if stream
240-
result = []
241-
llm
242-
.llm_model
243-
.to_llm
244-
.generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
245-
result << partial
248+
generate_result =
249+
lambda do
250+
if stream
251+
result = []
252+
llm
253+
.llm_model
254+
.to_llm
255+
.generate(
256+
prompt,
257+
user: Discourse.system_user,
258+
temperature: temperature,
259+
output_thinking: output_thinking,
260+
) { |partial| result << partial }
261+
else
262+
result =
263+
llm.llm_model.to_llm.generate(
264+
prompt,
265+
user: Discourse.system_user,
266+
temperature: temperature,
267+
output_thinking: output_thinking,
268+
)
246269
end
247-
else
248-
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
270+
end
271+
272+
generate_result.call
273+
274+
if followup
275+
followup = followup.dup.symbolize_keys!
276+
prompt.tools = followup[:tools] if followup[:tools]
277+
prompt.push_model_response(result)
278+
message = followup[:message].dup
279+
message = message.symbolize_keys!
280+
message[:type] = message[:type].to_sym if message[:type]
281+
%i[id name].each do |key|
282+
if message[key].is_a?(Array)
283+
type, inner_key = message[key]
284+
# this allows us to dynamically set the id or name of the tool call
285+
prev = prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
286+
message[key] = prev[inner_key.to_sym] if prev
287+
end
288+
end
289+
290+
prompt.push(**message)
291+
292+
begin
293+
generate_result.call
294+
rescue => e
295+
# should not happen but it helps debugging...
296+
puts e
297+
result = []
298+
end
249299
end
250300
result
251301
end

lib/completions/prompt.rb

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,49 @@ def initialize(
4141
@tool_choice = tool_choice
4242
end
4343

44+
# this new api tries to create symmetry between responses and prompts
45+
# this means anything we get back from the model via endpoint can be easily appended
46+
def push_model_response(response)
47+
response = [response] if !response.is_a? Array
48+
49+
thinking, thinking_signature, redacted_thinking_signature = nil
50+
51+
response.each do |message|
52+
if message.is_a?(Thinking)
53+
# we can safely skip partials here
54+
next if message.partial?
55+
if message.redacted
56+
redacted_thinking_signature = message.signature
57+
else
58+
thinking = message.message
59+
thinking_signature = message.signature
60+
end
61+
elsif message.is_a?(ToolCall)
62+
next if message.partial?
63+
# this is a bit surprising about the API
64+
# needing to add arguments is not ideal
65+
push(
66+
type: :tool_call,
67+
content: { arguments: message.parameters }.to_json,
68+
id: message.id,
69+
name: message.name,
70+
)
71+
elsif message.is_a?(String)
72+
push(type: :model, content: message)
73+
else
74+
raise ArgumentError, "response must be an array of strings, ToolCalls, or Thinkings"
75+
end
76+
end
77+
78+
# anthropic rules are that we attach thinking to last for the response
79+
# it is odd, I wonder if long term we just keep thinking as a separate object
80+
if thinking || redacted_thinking_signature
81+
messages.last[:thinking] = thinking
82+
messages.last[:thinking_signature] = thinking_signature
83+
messages.last[:redacted_thinking_signature] = redacted_thinking_signature
84+
end
85+
end
86+
4487
def push(
4588
type:,
4689
content:,

0 commit comments

Comments
 (0)