This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit ca95a10

llm as a judge
1 parent 72a5078 commit ca95a10

File tree

evals/lib/eval.rb
evals/lib/runner.rb

2 files changed: +64 -3 lines

evals/lib/eval.rb

Lines changed: 62 additions & 3 deletions
@@ -10,7 +10,8 @@ class DiscourseAi::Evals::Eval
     :vision,
     :expected_output,
     :expected_output_regex,
-    :expected_tool_call
+    :expected_tool_call,
+    :judge

   class EvalError < StandardError
     attr_reader :context
@@ -36,6 +37,8 @@ def initialize(path:)
       Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
     @expected_tool_call = @yaml[:expected_tool_call]
     @expected_tool_call.symbolize_keys! if @expected_tool_call
+    @judge = @yaml[:judge]
+    @judge.symbolize_keys! if @judge

     @args.each do |key, value|
       if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
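
For reference, the judge section read here comes straight from the eval's YAML file. Below is a minimal sketch of what such a definition could look like, parsed without Rails. The key names (prompt, llm, pass_rating) mirror the symbols this commit reads from @yaml[:judge]; the model name, prompt wording, and threshold are invented.

require "yaml"

# Hypothetical judge: section for an eval definition. symbolize_names
# stands in for the symbolize_keys! call above, which is a Rails extension.
config = YAML.safe_load(<<~YAML, symbolize_names: true)
  judge:
    llm: gpt-4o
    pass_rating: 7
    prompt: |
      Given this input:
      {{input}}
      Rate how faithfully the following output preserves it:
      {{output}}
YAML

judge = config[:judge]
judge[:pass_rating] # => 7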
@@ -84,6 +87,8 @@ def run(llm:)
       else
         { result: :pass }
       end
+    elsif judge
+      judge_result(result)
     else
       { result: :pass }
     end
@@ -111,14 +116,68 @@ def to_json

   private

-  def helper(llm, input:, name:)
+  def judge_result(result)
+    prompt = judge[:prompt].dup
+    prompt.sub!("{{output}}", result)
+    prompt.sub!("{{input}}", args[:input])
+
+    prompt += <<~SUFFIX
+
+      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
+
+      example output:
+
+      [RATING]10[/RATING] perfect output
+
+      example output:
+
+      [RATING]5[/RATING]
+
+      the following failed to preserve... etc...
+    SUFFIX
+
+    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
+
+    DiscourseAi::Completions::Prompt.new(
+      "You are an expert judge tasked at testing LLM outputs.",
+      messages: [{ type: :user, content: prompt }],
+    )
+
+    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+
+    if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
+      rating = rating[1].to_i
+    end
+
+    if rating.to_i >= judge[:pass_rating]
+      { result: :pass }
+    else
+      {
+        result: :fail,
+        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
+        context: result,
+      }
+    end
+  end
+
+  def helper(llm, input:, name:, locale: nil)
     completion_prompt = CompletionPrompt.find_by(name: name)
     helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
+    user = Discourse.system_user
+    if locale
+      user = User.new
+      class << user
+        attr_accessor :effective_locale
+      end
+
+      user.effective_locale = locale
+      user.admin = true
+    end
     result =
       helper.generate_and_send_prompt(
         completion_prompt,
         input,
-        current_user = Discourse.system_user,
+        current_user = user,
         _force_default_locale = false,
       )
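
The pass/fail decision in judge_result hinges on extracting a [RATING]n[/RATING] tag from the judge's reply. Here is a standalone sketch of that extraction and threshold check, free of Discourse dependencies; the sample response text and threshold are invented.

# Minimal sketch of the rating extraction in judge_result; the response
# and pass_rating below are made-up values.
response = "[RATING]8[/RATING] faithful, minor phrasing drift"
pass_rating = 7

rating = nil
if (match = response.match(%r{\[RATING\](\d+)\[/RATING\]}))
  rating = match[1].to_i
end

# rating.to_i also covers the no-match case, since nil.to_i == 0,
# which always fails a 1-10 threshold.
if rating.to_i >= pass_rating
  puts "pass (rated #{rating})"
else
  puts "fail (rated #{rating.inspect}, expected >= #{pass_rating})"
end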

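The locale support added to helper uses a singleton-class trick: reopening one object's metaclass so only that instance gains an effective_locale accessor. A self-contained illustration of the idiom follows; the Struct is a hypothetical stand-in for Discourse's User model, which is not loaded here.

# Illustration of the singleton-class idiom from helper; this Struct is
# a stand-in for Discourse's User model.
User = Struct.new(:admin)

user = User.new

# Reopen just this object's singleton class; only this instance gains
# the accessor, other User instances are untouched.
class << user
  attr_accessor :effective_locale
end

user.effective_locale = "fr"
user.admin = true

user.effective_locale                   # => "fr"
User.new.respond_to?(:effective_locale) # => false
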
evals/lib/runner.rb

Lines changed: 2 additions & 0 deletions
@@ -156,6 +156,8 @@ def run!
       if result[:result] == :fail
         puts "Failed 🔴"
         puts "Error: #{result[:message]}" if result[:message]
+        # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+        #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
         if result[:expected_output] && result[:actual_output]
           puts "---- Expected ----\n#{result[:expected_output]}"
           puts "---- Actual ----\n#{result[:actual_output]}"

0 commit comments