Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 3533cd1

Browse files
authored
DEV: Allow prompt-type evals to take in several prompts and messages (#1190)
* DEV: Allow prompt-type evals to take in several prompts and messages * ❄️
1 parent 51ca942 commit 3533cd1

File tree

4 files changed

+125
-79
lines changed

4 files changed

+125
-79
lines changed

evals/lib/eval.rb

Lines changed: 46 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,51 @@ def run(llm:)
5757
when "image_to_text"
5858
image_to_text(llm, **args)
5959
when "prompt"
60-
prompt_call(llm, **args)
60+
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
6161
when "edit_artifact"
6262
edit_artifact(llm, **args)
6363
when "summarization"
6464
summarization(llm, **args)
6565
end
6666

67+
classify_results(result)
68+
rescue EvalError => e
69+
{ result: :fail, message: e.message, context: e.context }
70+
end
71+
72+
# Writes a one-line, human-readable summary of this eval to stdout.
def print
  summary = "#{id}: #{description}"
  puts summary
end
75+
76+
# Serializes this eval's attributes into a hash, omitting any attribute
# that is nil. (Despite the name, this returns a Hash, not a JSON string.)
def to_json
  %i[
    type
    path
    name
    description
    id
    args
    vision
    expected_output
    expected_output_regex
  ].each_with_object({}) do |attr, serialized|
    value = instance_variable_get(:"@#{attr}")
    serialized[attr] = value unless value.nil?
  end
end
89+
90+
private
91+
92+
# Normalizes eval output into an array of classified result hashes.
#
# @param result [String, Array<Hash>] either a raw llm response string or
#   an array like [{ result: "llm response", other_attrs: here }]
# @return [Array<Hash>] hashes classified as pass or fail, keeping any
#   extra attributes that were already present
def classify_results(result)
  return [classify_result_pass_fail(result)] unless result.is_a?(Array)

  result.each { |r| r.merge!(classify_result_pass_fail(r)) }
end
103+
104+
def classify_result_pass_fail(result)
67105
if expected_output
68106
if result == expected_output
69107
{ result: :pass }
@@ -94,34 +132,17 @@ def run(llm:)
94132
else
95133
{ result: :pass }
96134
end
97-
rescue EvalError => e
98-
{ result: :fail, message: e.message, context: e.context }
99135
end
100136

101-
def print
102-
puts "#{id}: #{description}"
103-
end
104-
105-
def to_json
106-
{
107-
type: @type,
108-
path: @path,
109-
name: @name,
110-
description: @description,
111-
id: @id,
112-
args: @args,
113-
vision: @vision,
114-
expected_output: @expected_output,
115-
expected_output_regex: @expected_output_regex,
116-
}.compact
117-
end
118-
119-
private
120-
121137
def judge_result(result)
122138
prompt = judge[:prompt].dup
123-
prompt.sub!("{{output}}", result)
124-
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
139+
if result.is_a?(String)
140+
prompt.sub!("{{output}}", result)
141+
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
142+
else
143+
prompt.sub!("{{output}}", result[:result])
144+
result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
145+
end
125146

126147
prompt += <<~SUFFIX
127148
@@ -220,36 +241,6 @@ def pdf_to_text(llm, path:)
220241
upload.destroy if upload
221242
end
222243

223-
def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
224-
if tools
225-
tools.each do |tool|
226-
tool.symbolize_keys!
227-
tool[:parameters].symbolize_keys! if tool[:parameters]
228-
end
229-
end
230-
prompt =
231-
DiscourseAi::Completions::Prompt.new(
232-
system_prompt,
233-
messages: [{ type: :user, content: message }],
234-
)
235-
236-
prompt.tools = tools if tools
237-
238-
result = nil
239-
if stream
240-
result = []
241-
llm
242-
.llm_model
243-
.to_llm
244-
.generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
245-
result << partial
246-
end
247-
else
248-
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
249-
end
250-
result
251-
end
252-
253244
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
254245
css = File.read(css_path)
255246
js = File.read(js_path)

evals/lib/prompt_evaluator.rb

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# frozen_string_literal: true
2+
3+
# Runs prompt-type evals: every system prompt is paired with every user
# message and sent through the given LLM, with a progress line on stdout.
class DiscourseAi::Evals::PromptEvaluator
  # @param llm [DiscourseAi::Evals::Llm] eval wrapper exposing #llm_model
  def initialize(llm)
    @llm = llm.llm_model.to_llm
  end

  # Evaluates the cartesian product of prompts x messages.
  #
  # @param prompts [Array<String>] system prompts to evaluate
  # @param messages [Array<String>, nil] user messages; nil yields no calls
  # @param temperature [Float, nil] sampling temperature passed to the LLM
  # @param tools [Array<Hash>, nil] tool definitions with string keys
  # @param stream [Boolean] when true, collect streamed partials into an array
  # @return [Array<Hash>] one { prompt:, message:, result: } hash per pair
  def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
    tools = symbolize_tools(tools)
    # BUG FIX: the keyword default is nil, but the original body called
    # messages.size/messages.map directly, so omitting messages: raised
    # NoMethodError. Array(nil) => [] makes the default a no-op instead.
    messages = Array(messages)
    total = prompts.size * messages.size
    count = 0
    puts ""

    prompts.flat_map do |prompt|
      messages.map do |content|
        count += 1
        print "\rProcessing #{count}/#{total}"

        c_prompt =
          DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
        c_prompt.tools = tools if tools
        result = { prompt:, message: content }
        result[:result] = generate_result(c_prompt, temperature, stream)
        result
      end
    end
  ensure
    # Clear the progress line whether we finished or raised.
    print "\r\033[K"
  end

  private

  # Generates one completion; when streaming, returns the array of partials
  # in arrival order, otherwise the single generated response.
  def generate_result(c_prompt, temperature, stream)
    if stream
      stream_result = []
      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
        stream_result << partial
      end
      stream_result
    else
      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
    end
  end

  # Converts string-keyed tool hashes into the symbol-keyed shape the
  # completions API expects; returns nil when no tools were given.
  def symbolize_tools(tools)
    return nil if tools.nil?

    tools.map do |tool|
      { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
    end
  end
end

evals/lib/runner.rb

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -148,31 +148,33 @@ def run!
148148
structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
149149
logger.info("Evaluating with LLM: #{llm.name}")
150150
print "#{llm.name}: "
151-
result = @eval.run(llm: llm)
152-
153-
step[:args] = result
154-
step[:cname] = result[:result] == :pass ? :good : :bad
155-
156-
if result[:result] == :fail
157-
puts "Failed 🔴"
158-
puts "Error: #{result[:message]}" if result[:message]
159-
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
160-
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
161-
if result[:expected_output] && result[:actual_output]
162-
puts "---- Expected ----\n#{result[:expected_output]}"
163-
puts "---- Actual ----\n#{result[:actual_output]}"
151+
results = @eval.run(llm: llm)
152+
153+
results.each do |result|
154+
step[:args] = result
155+
step[:cname] = result[:result] == :pass ? :good : :bad
156+
157+
if result[:result] == :fail
158+
puts "Failed 🔴"
159+
puts "Error: #{result[:message]}" if result[:message]
160+
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
161+
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
162+
if result[:expected_output] && result[:actual_output]
163+
puts "---- Expected ----\n#{result[:expected_output]}"
164+
puts "---- Actual ----\n#{result[:actual_output]}"
165+
end
166+
logger.error("Evaluation failed with LLM: #{llm.name}")
167+
logger.error("Error: #{result[:message]}") if result[:message]
168+
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
169+
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
170+
logger.error("Context: #{result[:context]}") if result[:context]
171+
elsif result[:result] == :pass
172+
puts "Passed 🟢"
173+
logger.info("Evaluation passed with LLM: #{llm.name}")
174+
else
175+
STDERR.puts "Error: Unknown result #{eval.inspect}"
176+
logger.error("Unknown result: #{eval.inspect}")
164177
end
165-
logger.error("Evaluation failed with LLM: #{llm.name}")
166-
logger.error("Error: #{result[:message]}") if result[:message]
167-
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
168-
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
169-
logger.error("Context: #{result[:context]}") if result[:context]
170-
elsif result[:result] == :pass
171-
puts "Passed 🟢"
172-
logger.info("Evaluation passed with LLM: #{llm.name}")
173-
else
174-
STDERR.puts "Error: Unknown result #{eval.inspect}"
175-
logger.error("Unknown result: #{eval.inspect}")
176178
end
177179
end
178180
end

evals/run

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require_relative "lib/llm"
66
require_relative "lib/cli"
77
require_relative "lib/runner"
88
require_relative "lib/eval"
9+
require_relative "lib/prompt_evaluator"
910

1011
options = DiscourseAi::Evals::Cli.parse_options!
1112

0 commit comments

Comments
 (0)