23 changes: 23 additions & 0 deletions README.md
@@ -3,3 +3,26 @@
**Plugin Summary**

For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco

### Evals

The directory `evals` contains AI evals for the Discourse AI plugin.

To run them, use:

    cd evals
    ./run --help

```
Usage: evals/run [options]
-e, --eval NAME Name of the evaluation to run
--list-models List models
-m, --model NAME Model to evaluate (will eval all models if not specified)
-l, --list List evals
```

To run evals, you will need to configure API keys in your environment:

    OPENAI_API_KEY=your_openai_api_key
    ANTHROPIC_API_KEY=your_anthropic_api_key
    GEMINI_API_KEY=your_gemini_api_key
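
For example, to list the available evals and then run one of them against a single model (the eval and model names below are illustrative — use `--list` and `--list-models` to see the real ones):

    ./run --list
    ./run -e my-eval -m gpt-4o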
13 changes: 13 additions & 0 deletions app/models/ai_artifact_version.rb
@@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
validates :html, length: { maximum: 65_535 }
validates :css, length: { maximum: 65_535 }
validates :js, length: { maximum: 65_535 }

# used when generating test cases
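# A usage sketch (assumes an existing version record and a temp dir;
# names are illustrative):
#
#   require "tmpdir"
#
#   version = AiArtifactVersion.last
#   Dir.mktmpdir do |dir|
#     version.write_to(dir)
#     # dir now holds main.css, main.html, main.js and instructions.txt
#   end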
def write_to(path)
css_path = "#{path}/main.css"
html_path = "#{path}/main.html"
js_path = "#{path}/main.js"
instructions_path = "#{path}/instructions.txt"

File.write(css_path, css)
File.write(html_path, html)
File.write(js_path, js)
File.write(instructions_path, change_description)
end
end

# == Schema Information
159 changes: 151 additions & 8 deletions evals/lib/eval.rb
@@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
:vision,
:expected_output,
:expected_output_regex,
:expected_tool_call,
:judge

class EvalError < StandardError
attr_reader :context

def initialize(message, context)
super(message)
@context = context
end
end

def initialize(path:)
@yaml = YAML.load_file(path).symbolize_keys
@@ -27,10 +37,14 @@ def initialize(path:)
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
@expected_tool_call = @yaml[:expected_tool_call]
@expected_tool_call.symbolize_keys! if @expected_tool_call
@judge = @yaml[:judge]
@judge.symbolize_keys! if @judge

@args.each do |key, value|
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
@args[key] = File.expand_path(File.join(File.dirname(path), value))
end
end
end
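
# For reference, an eval definition this constructor accepts might look
# like the sketch below (keys mirror the parsing above; the type and
# values are illustrative):
#
#   type: prompt
#   args:
#     system_prompt: You are a calculator. Reply with the result only.
#     message: What is 2 + 2?
#   expected_output: "4"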

def run(llm:)
@@ -44,6 +58,8 @@ def run(llm:)
image_to_text(llm, **args)
when "prompt"
prompt_call(llm, **args)
when "edit_artifact"
edit_artifact(llm, **args)
end

if expected_output
@@ -53,7 +69,7 @@
{ result: :fail, expected_output: expected_output, actual_output: result }
end
elsif expected_output_regex
if result.to_s.match?(expected_output_regex)
{ result: :pass }
else
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
@@ -71,9 +87,13 @@
else
{ result: :pass }
end
elsif judge
judge_result(result)
else
{ result: :pass }
end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end

def print
@@ -96,14 +116,68 @@ def to_json

private

def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result.to_s)
prompt.sub!("{{input}}", args[:input].to_s)

prompt += <<~SUFFIX

Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

example output:

[RATING]10[/RATING] perfect output

example output:

[RATING]5[/RATING]

the following failed to preserve... etc...
SUFFIX

judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

# Wrap the judge instructions in a system message plus the rendered user prompt
judge_prompt =
DiscourseAi::Completions::Prompt.new(
"You are an expert judge tasked with testing LLM outputs.",
messages: [{ type: :user, content: prompt }],
)

result = judge_llm.llm_model.to_llm.generate(judge_prompt, user: Discourse.system_user)

if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
rating = rating[1].to_i
end

if rating.to_i >= judge[:pass_rating]
{ result: :pass }
else
{
result: :fail,
message: "LLM rating below threshold: got #{rating}, expected #{judge[:pass_rating]}",
context: result,
}
end
end
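
# The judge section of an eval file might look like this sketch
# ({{input}} and {{output}} are substituted above; the model name and
# pass_rating are illustrative):
#
#   judge:
#     llm: gpt-4o
#     pass_rating: 8
#     prompt: |
#       Here is the input: {{input}}
#       Here is the output: {{output}}
#       Rate how faithfully the output follows the input.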

def helper(llm, input:, name:, locale: nil)
completion_prompt = CompletionPrompt.find_by(name: name)
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
user = Discourse.system_user
if locale
user = User.new
class << user
attr_accessor :effective_locale
end

user.effective_locale = locale
user.admin = true
end
result =
helper.generate_and_send_prompt(
completion_prompt,
input,
current_user = user,
_force_default_locale = false,
)

@@ -169,4 +243,73 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
end
result
end

def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)
html = File.read(html_path)
instructions = File.read(instructions_path)
artifact =
AiArtifact.create!(
css: css,
js: js,
html: html,
user_id: Discourse.system_user.id,
post_id: 1,
name: "eval artifact",
)

post = Post.new(topic_id: 1, id: 1)
diff =
DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
llm: llm.llm_model.to_llm,
post: post,
user: Discourse.system_user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
diff.apply

if diff.failed_searches.present?
puts "Eval Errors encountered"
p diff.failed_searches
raise EvalError.new("Failed to apply all changes", diff.failed_searches)
end

version = artifact.versions.last
raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)

output = { css: version.css, js: version.js, html: version.html }

artifact.destroy
output
end
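
# An edit_artifact eval can point at files written out by
# AiArtifactVersion#write_to; a sketch (paths are illustrative and are
# resolved relative to the YAML file by the constructor):
#
#   type: edit_artifact
#   args:
#     css_path: artifact/main.css
#     html_path: artifact/main.html
#     js_path: artifact/main.js
#     instructions_path: artifact/instructions.txt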

def valid_javascript?(str)
require "open3"

# Create a temporary file with the JavaScript code
Tempfile.create(%w[test .js]) do |f|
f.write(str)
f.flush

File.write("/tmp/test.js", str)

begin
Discourse::Utils.execute_command(
"node",
"--check",
f.path,
failure_message: "Invalid JavaScript syntax",
timeout: 30, # reasonable timeout in seconds
)
true
rescue Discourse::Utils::CommandError
false
end
end
rescue StandardError
false
end
end
13 changes: 11 additions & 2 deletions evals/lib/runner.rb
@@ -155,9 +155,18 @@ def run!

if result[:result] == :fail
puts "Failed 🔴"
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
puts "Error: #{result[:message]}" if result[:message]
# Deliberately commented out: it creates a lot of noise, but it is sometimes useful for debugging.
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")