101 changes: 46 additions & 55 deletions evals/lib/eval.rb
@@ -57,13 +57,51 @@ def run(llm:)
when "image_to_text"
image_to_text(llm, **args)
when "prompt"
prompt_call(llm, **args)
DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
when "edit_artifact"
edit_artifact(llm, **args)
when "summarization"
summarization(llm, **args)
end

classify_results(result)
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end

def print
puts "#{id}: #{description}"
end

def to_json
{
type: @type,
path: @path,
name: @name,
description: @description,
id: @id,
args: @args,
vision: @vision,
expected_output: @expected_output,
expected_output_regex: @expected_output_regex,
}.compact
end

private

# @param result [String, Array<Hash>] the result of the eval, either
# "llm response" or [{ result: "llm response", other_attrs: here }]
# @return [Array<Hash>] an array of hashes with the result classified
# as pass or fail, along with extra attributes
def classify_results(result)
if result.is_a?(Array)
result.each { |r| r.merge!(classify_result_pass_fail(r)) }
else
[classify_result_pass_fail(result)]
end
end

def classify_result_pass_fail(result)
if expected_output
if result == expected_output
{ result: :pass }
@@ -94,34 +132,17 @@ def run(llm:)
else
{ result: :pass }
end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end

def print
puts "#{id}: #{description}"
end

def to_json
{
type: @type,
path: @path,
name: @name,
description: @description,
id: @id,
args: @args,
vision: @vision,
expected_output: @expected_output,
expected_output_regex: @expected_output_regex,
}.compact
end

private

def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result)
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
if result.is_a?(String)
prompt.sub!("{{output}}", result)
args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
else
prompt.sub!("{{output}}", result[:result])
result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
end

prompt += <<~SUFFIX

@@ -220,36 +241,6 @@ def pdf_to_text(llm, path:)
upload.destroy if upload
end

def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
if tools
tools.each do |tool|
tool.symbolize_keys!
tool[:parameters].symbolize_keys! if tool[:parameters]
end
end
prompt =
DiscourseAi::Completions::Prompt.new(
system_prompt,
messages: [{ type: :user, content: message }],
)

prompt.tools = tools if tools

result = nil
if stream
result = []
llm
.llm_model
.to_llm
.generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
result << partial
end
else
result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
end
result
end

def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)
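With this change, `prompt` evals are delegated to the new `DiscourseAi::Evals::PromptEvaluator`, and `classify_results` normalizes whatever comes back: either a plain LLM response string or an array of per-prompt result hashes. A minimal sketch of the two shapes it accepts, with made-up values (only the `:result`, `:prompt`, and `:message` keys come from the diff):

```ruby
# Illustrative sketch only -- the values are invented; the key names mirror the diff above.

single = "llm response" # what most eval types return

multi = [
  { prompt: "system prompt A", message: "hello", result: "llm response" },
  { prompt: "system prompt B", message: "hello", result: "another response" },
] # what PromptEvaluator#prompt_call returns

# classify_results wraps the single string as [{ result: :pass }] (or :fail),
# and merges { result: :pass } / { result: :fail, ... } into each hash of the array.
```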
52 changes: 52 additions & 0 deletions evals/lib/prompt_evaluator.rb
@@ -0,0 +1,52 @@
# frozen_string_literal: true

class DiscourseAi::Evals::PromptEvaluator
def initialize(llm)
@llm = llm.llm_model.to_llm
end

def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
tools = symbolize_tools(tools)
total = prompts.size * messages.size
count = 0
puts ""

prompts.flat_map do |prompt|
messages.map do |content|
count += 1
print "\rProcessing #{count}/#{total}"

c_prompt =
DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
c_prompt.tools = tools if tools
result = { prompt:, message: content }
result[:result] = generate_result(c_prompt, temperature, stream)
result
end
end
ensure
print "\r\033[K"
end

private

def generate_result(c_prompt, temperature, stream)
if stream
stream_result = []
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
stream_result << partial
end
stream_result
else
@llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
end
end

def symbolize_tools(tools)
return nil if tools.nil?

tools.map do |tool|
{ name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
end
end
end
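A hypothetical usage sketch of the new class: the `llm` wrapper is assumed to respond to `llm_model.to_llm` as in `initialize`, and the string-keyed tool hash matches what `symbolize_tools` expects; the keyword arguments follow the `prompt_call` signature above.

```ruby
evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)

# Fans out prompts x messages: 2 prompts x 1 message => 2 result hashes.
results =
  evaluator.prompt_call(
    prompts: ["You are a terse assistant.", "You are a verbose assistant."],
    messages: ["Summarize this plugin in one sentence."],
    temperature: 0.2,
    tools: [{ "name" => "categorize", "parameters" => { "type" => "object" } }],
  )

# => [
#      { prompt: "You are a terse assistant.",   message: "Summarize...", result: "..." },
#      { prompt: "You are a verbose assistant.", message: "Summarize...", result: "..." },
#    ]
```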
50 changes: 26 additions & 24 deletions evals/lib/runner.rb
@@ -148,31 +148,33 @@ def run!
structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
logger.info("Evaluating with LLM: #{llm.name}")
print "#{llm.name}: "
result = @eval.run(llm: llm)

step[:args] = result
step[:cname] = result[:result] == :pass ? :good : :bad

if result[:result] == :fail
puts "Failed 🔴"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
results = @eval.run(llm: llm)

results.each do |result|
step[:args] = result
step[:cname] = result[:result] == :pass ? :good : :bad

if result[:result] == :fail
puts "Failed 🔴"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")
else
STDERR.puts "Error: Unknown result #{eval.inspect}"
logger.error("Unknown result: #{eval.inspect}")
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")
else
STDERR.puts "Error: Unknown result #{eval.inspect}"
logger.error("Unknown result: #{eval.inspect}")
end
end
end
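Because `Eval#run` now returns an array, the runner iterates it and reports each entry separately. A condensed illustration of the per-entry branching, with invented result hashes:

```ruby
# Condensed sketch of the loop above; the result hashes are invented.
results = [{ result: :pass }, { result: :fail, message: "regex did not match" }]

results.each do |result|
  case result[:result]
  when :pass then puts "Passed 🟢"
  when :fail then puts "Failed 🔴 (#{result[:message]})"
  else STDERR.puts "Error: Unknown result #{result.inspect}"
  end
end
```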
1 change: 1 addition & 0 deletions evals/run
Expand Up @@ -6,6 +6,7 @@ require_relative "lib/llm"
require_relative "lib/cli"
require_relative "lib/runner"
require_relative "lib/eval"
require_relative "lib/prompt_evaluator"

options = DiscourseAi::Evals::Cli.parse_options!
