Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 5bf61ef

Browse files
authored
DEV: Support multiple tests per eval and followups per test (#1199)
See discourse/discourse-ai-evals#9 for format of prompts
1 parent 107f144 commit 5bf61ef

File tree

5 files changed

+171
-59
lines changed

5 files changed

+171
-59
lines changed

evals/lib/eval.rb

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ def initialize(path:)
2929
@id = @yaml[:id]
3030
@description = @yaml[:description]
3131
@vision = @yaml[:vision]
32-
@args = @yaml[:args]&.symbolize_keys
3332
@type = @yaml[:type]
3433
@expected_output = @yaml[:expected_output]
3534
@expected_output_regex = @yaml[:expected_output_regex]
@@ -39,10 +38,14 @@ def initialize(path:)
3938
@expected_tool_call.symbolize_keys! if @expected_tool_call
4039
@judge = @yaml[:judge]
4140
@judge.symbolize_keys! if @judge
42-
43-
@args.each do |key, value|
44-
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
45-
@args[key] = File.expand_path(File.join(File.dirname(path), value))
41+
if @yaml[:args].is_a?(Array)
42+
@args = @yaml[:args].map(&:symbolize_keys)
43+
else
44+
@args = @yaml[:args].symbolize_keys
45+
@args.each do |key, value|
46+
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
47+
@args[key] = File.expand_path(File.join(File.dirname(path), value))
48+
end
4649
end
4750
end
4851
end
@@ -57,7 +60,7 @@ def run(llm:)
5760
when "image_to_text"
5861
image_to_text(llm, **args)
5962
when "prompt"
60-
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
63+
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(args)
6164
when "edit_artifact"
6265
edit_artifact(llm, **args)
6366
when "summarization"

evals/lib/prompt_evaluator.rb

Lines changed: 0 additions & 52 deletions
This file was deleted.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# frozen_string_literal: true
2+
3+
class DiscourseAi::Evals::PromptEvaluator
  def initialize(llm)
    @llm = llm.llm_model.to_llm
  end

  # Runs every prompt/message combination of every test (plus any followups)
  # against the LLM, printing a progress counter while it works.
  #
  # @param args [Hash, Array<Hash>] a single test hash or an array of them
  # @return [Array<Hash>] one { prompt:, message:, result: } hash per combination
  def prompt_call(args)
    args = [args] unless args.is_a?(Array)
    runner = DiscourseAi::Evals::PromptSingleTestRunner.new(@llm)

    with_tests_progress(total: args.size) do |bump_progress|
      args.flat_map do |test|
        bump_progress.call

        prompts, messages, followups, output_thinking, stream, temperature, tools =
          symbolize_test_args(test)

        # every prompt is paired with every message
        prompts.product(messages).map do |prompt, message|
          runner.run_single_test(
            prompt,
            message,
            followups,
            output_thinking,
            stream,
            temperature,
            tools,
          )
        end
      end
    end
  end

  private

  # Normalizes one test hash into the positional arguments the runner expects.
  # Singular keys (:prompt, :message) are wrapped into one-element arrays.
  def symbolize_test_args(args)
    [
      args[:prompts] || [args[:prompt]],
      args[:messages] || [args[:message]],
      symbolize_followups(args),
      args[:output_thinking] || false,
      args[:stream] || false,
      args[:temperature],
      symbolize_tools(args[:tools]),
    ]
  end

  # Builds the followup list (nil when the test declares none), symbolizing
  # keys and converting the message :type to a Symbol.
  def symbolize_followups(args)
    return nil if args[:followups].nil? && args[:followup].nil?

    list = args[:followups] || [args[:followup]]
    list.map do |raw|
      followup = raw.dup.symbolize_keys!
      message = followup[:message].dup.symbolize_keys!
      message[:type] = message[:type].to_sym if message[:type]
      followup.merge(message: message)
    end
  end

  # Symbolizes tool definitions and their parameter hashes; nil-valued
  # entries are dropped by the trailing compact.
  def symbolize_tools(tools)
    return nil if tools.nil?

    tools.map do |tool|
      tool.symbolize_keys!
      params = tool[:parameters]&.map { |param| param.transform_keys(&:to_sym) }
      tool.merge(parameters: params).compact
    end
  end

  # Yields a zero-arg callable that advances a "Processing test i/total"
  # line on each call; clears the line afterwards and returns the block's result.
  def with_tests_progress(total:)
    puts ""
    done = 0
    bumper = -> do
      done += 1
      print "\rProcessing test #{done}/#{total}"
    end
    result = yield(bumper)
    print "\r\033[K"
    result
  end
end
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# frozen_string_literal: true
2+
3+
class DiscourseAi::Evals::PromptSingleTestRunner
  def initialize(llm)
    @llm = llm
  end

  # Run a single test with a prompt and message, and some model settings
  # @param prompt [String] the prompt to use
  # @param message [String] the message to use
  # @param followups [Array<Hash>, nil] an array of followups (messages) to run after the initial prompt
  # @param output_thinking [Boolean] whether to output the thinking state of the model
  # @param stream [Boolean] whether to stream the output of the model
  # @param temperature [Float, nil] the temperature to use when generating completions
  # @param tools [Array<Hash>, nil] an array of tools to use when generating completions
  # @return [Hash] the prompt, message, and result of the test
  def run_single_test(prompt, message, followups, output_thinking, stream, temperature, tools)
    @c_prompt =
      DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: message }])
    @c_prompt.tools = tools if tools
    generate_result(temperature, output_thinking, stream)

    followups&.each { |followup| generate_followup(followup, output_thinking, stream, temperature) }

    { prompt:, message:, result: @result }
  end

  private

  # Pushes the previous model response plus the followup message onto the
  # conversation and generates again, updating @result.
  def generate_followup(followup, output_thinking, stream, temperature)
    @c_prompt.push_model_response(@result)
    followup_message = set_followup_tool(followup)
    @c_prompt.push(**followup_message)
    begin
      generate_result(temperature, output_thinking, stream)
    rescue => e
      # should not happen but it helps debugging...
      puts e
      # BUGFIX: was `result = []` — a dead local variable. Assign the ivar so a
      # failed followup clears the stale previous result instead of keeping it.
      @result = []
    end
  end

  # Applies followup-specific tools and resolves dynamic ids/names.
  # When the message :id or :name is a two-element Array of [message_type, key],
  # it is replaced with that key's value from the most recent prompt message of
  # that type (lets eval YAML reference a model-generated tool call id/name).
  def set_followup_tool(followup)
    @c_prompt.tools = followup[:tools] if followup[:tools]
    followup_message = followup[:message]
    %i[id name].each do |key|
      if followup_message[key].is_a?(Array)
        type, inner_key = followup_message[key]
        # this allows us to dynamically set the id or name of the tool call
        prev = @c_prompt.messages.reverse_each.find { |m| m[:type] == type.to_sym }
        followup_message[key] = prev[inner_key.to_sym] if prev
      end
    end
    followup_message
  end

  # Generates a completion for @c_prompt into @result. When streaming, the
  # partial chunks are collected into an Array; otherwise the full response
  # object is stored.
  def generate_result(temperature, output_thinking, stream)
    @result =
      if stream
        stream_result = []
        @llm.generate(
          @c_prompt,
          user: Discourse.system_user,
          temperature:,
          output_thinking:,
        ) { |partial| stream_result << partial }
        stream_result
      else
        @llm.generate(@c_prompt, user: Discourse.system_user, temperature:, output_thinking:)
      end
  end
end

evals/run

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ require_relative "lib/llm"
66
require_relative "lib/cli"
77
require_relative "lib/runner"
88
require_relative "lib/eval"
9-
require_relative "lib/prompt_evaluator"
9+
require_relative "lib/prompts/prompt_evaluator"
10+
require_relative "lib/prompts/single_test_runner"
1011

1112
options = DiscourseAi::Evals::Cli.parse_options!
1213

0 commit comments

Comments
 (0)