23 changes: 23 additions & 0 deletions README.md
@@ -3,3 +3,26 @@
**Plugin Summary**

For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco

### Evals

The directory `evals` contains AI evals for the Discourse AI plugin.

To run them, use:

    cd evals
    ./run --help

```
Usage: evals/run [options]
-e, --eval NAME Name of the evaluation to run
--list-models List models
-m, --model NAME Model to evaluate (will eval all models if not specified)
-l, --list List evals
```

To run evals, you will need to configure API keys in your environment:

    OPENAI_API_KEY=your_openai_api_key
    ANTHROPIC_API_KEY=your_anthropic_api_key
    GEMINI_API_KEY=your_gemini_api_key
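
For example, to list the available evals and then run one of them against a single model (the eval and model names below are illustrative — use `--list` and `--list-models` to see the real ones):

    ./run --list
    ./run -e my-eval -m gpt-4o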
13 changes: 13 additions & 0 deletions app/models/ai_artifact_version.rb
@@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
validates :html, length: { maximum: 65_535 }
validates :css, length: { maximum: 65_535 }
validates :js, length: { maximum: 65_535 }

# used when generating test cases
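# A usage sketch (assumes an existing version record and a temp dir;
# names are illustrative):
#
#   require "tmpdir"
#
#   version = AiArtifactVersion.last
#   Dir.mktmpdir do |dir|
#     version.write_to(dir)
#     # dir now holds main.css, main.html, main.js and instructions.txt
#   end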
def write_to(path)
css_path = "#{path}/main.css"
html_path = "#{path}/main.html"
js_path = "#{path}/main.js"
instructions_path = "#{path}/instructions.txt"

File.write(css_path, css)
File.write(html_path, html)
File.write(js_path, js)
File.write(instructions_path, change_description)
end
end

# == Schema Information
159 changes: 151 additions & 8 deletions evals/lib/eval.rb
@@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
:vision,
:expected_output,
:expected_output_regex,
:expected_tool_call,
:judge

class EvalError < StandardError
attr_reader :context

def initialize(message, context)
super(message)
@context = context
end
end

def initialize(path:)
@yaml = YAML.load_file(path).symbolize_keys
@@ -27,10 +37,14 @@ def initialize(path:)
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
@expected_tool_call = @yaml[:expected_tool_call]
@expected_tool_call.symbolize_keys! if @expected_tool_call
@judge = @yaml[:judge]
@judge.symbolize_keys! if @judge

@args.each do |key, value|
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
@args[key] = File.expand_path(File.join(File.dirname(path), value))
end
end
end
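
# For reference, an eval definition this constructor accepts might look
# like the sketch below (keys mirror the parsing above; the type and
# values are illustrative):
#
#   type: prompt
#   args:
#     system_prompt: You are a calculator. Reply with the result only.
#     message: What is 2 + 2?
#   expected_output: "4"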

def run(llm:)
@@ -44,6 +58,8 @@ def run(llm:)
image_to_text(llm, **args)
when "prompt"
prompt_call(llm, **args)
when "edit_artifact"
edit_artifact(llm, **args)
end

if expected_output
@@ -53,7 +69,7 @@
{ result: :fail, expected_output: expected_output, actual_output: result }
end
elsif expected_output_regex
if result.to_s.match?(expected_output_regex)
{ result: :pass }
else
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
@@ -71,9 +87,13 @@
else
{ result: :pass }
end
elsif judge
judge_result(result)
else
{ result: :pass }
end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end

def print
@@ -96,14 +116,68 @@ def to_json

private

def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result.to_s)
prompt.sub!("{{input}}", args[:input].to_s)

prompt += <<~SUFFIX

Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

example output:

[RATING]10[/RATING] perfect output

example output:

[RATING]5[/RATING]

the following failed to preserve... etc...
SUFFIX

judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

# Wrap the judge instructions in a system message plus the rendered user prompt
judge_prompt =
DiscourseAi::Completions::Prompt.new(
"You are an expert judge tasked with testing LLM outputs.",
messages: [{ type: :user, content: prompt }],
)

result = judge_llm.llm_model.to_llm.generate(judge_prompt, user: Discourse.system_user)

if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
rating = rating[1].to_i
end

if rating.to_i >= judge[:pass_rating]
{ result: :pass }
else
{
result: :fail,
message: "LLM rating below threshold: got #{rating}, expected #{judge[:pass_rating]}",
context: result,
}
end
end
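
# The judge section of an eval file might look like this sketch
# ({{input}} and {{output}} are substituted above; the model name and
# pass_rating are illustrative):
#
#   judge:
#     llm: gpt-4o
#     pass_rating: 8
#     prompt: |
#       Here is the input: {{input}}
#       Here is the output: {{output}}
#       Rate how faithfully the output follows the input.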

def helper(llm, input:, name:, locale: nil)
completion_prompt = CompletionPrompt.find_by(name: name)
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
user = Discourse.system_user
if locale
user = User.new
class << user
attr_accessor :effective_locale
end

user.effective_locale = locale
user.admin = true
end
result =
helper.generate_and_send_prompt(
completion_prompt,
input,
current_user = user,
_force_default_locale = false,
)

@@ -169,4 +243,73 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
end
result
end

def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)
html = File.read(html_path)
instructions = File.read(instructions_path)
artifact =
AiArtifact.create!(
css: css,
js: js,
html: html,
user_id: Discourse.system_user.id,
post_id: 1,
name: "eval artifact",
)

post = Post.new(topic_id: 1, id: 1)
diff =
DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
llm: llm.llm_model.to_llm,
post: post,
user: Discourse.system_user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
diff.apply

if diff.failed_searches.present?
puts "Eval Errors encountered"
p diff.failed_searches
raise EvalError.new("Failed to apply all changes", diff.failed_searches)
end

version = artifact.versions.last
raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)

output = { css: version.css, js: version.js, html: version.html }

artifact.destroy
output
end
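
# An edit_artifact eval can point at files written out by
# AiArtifactVersion#write_to; a sketch (paths are illustrative and are
# resolved relative to the YAML file by the constructor):
#
#   type: edit_artifact
#   args:
#     css_path: artifact/main.css
#     html_path: artifact/main.html
#     js_path: artifact/main.js
#     instructions_path: artifact/instructions.txt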

def valid_javascript?(str)
require "open3"

# Create a temporary file with the JavaScript code
Tempfile.create(%w[test .js]) do |f|
f.write(str)
f.flush

File.write("/tmp/test.js", str)

begin
Discourse::Utils.execute_command(
"node",
"--check",
f.path,
failure_message: "Invalid JavaScript syntax",
timeout: 30, # reasonable timeout in seconds
)
true
rescue Discourse::Utils::CommandError
false
end
end
rescue StandardError
false
end
end
13 changes: 11 additions & 2 deletions evals/lib/runner.rb
@@ -155,9 +155,18 @@ def run!

if result[:result] == :fail
puts "Failed 🔴"
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
puts "Error: #{result[:message]}" if result[:message]
# Deliberately commented out: it creates a lot of noise, but it is sometimes useful for debugging.
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")