From 8e2ffaed23d40b22d4d7d48cdc53f0c22232c615 Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Mon, 17 Feb 2025 19:42:22 +1100
Subject: [PATCH 1/9] WIP: smarter edit algorithm

---
 lib/ai_bot/artifact_update_strategies/diff.rb | 10 +++---
 lib/utils/diff_utils/simple_diff.rb           | 33 +++++++++++++++++--
 spec/lib/utils/diff_utils/simple_diff_spec.rb | 26 +++++++++++++++
 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb
index fc2c23459..0919cc164 100644
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@@ -98,16 +98,18 @@ def system_prompt
 
             1. Use EXACTLY this format for changes:
                <<<<<<< SEARCH
-               (exact code to find)
+               (first line of code to replace)
+               (other lines of code to avoid ambiguity)
+               (last line of code to replace)
                =======
                (replacement code)
                >>>>>>> REPLACE
             2. DO NOT modify the markers or add spaces around them
             3. DO NOT add explanations or comments within sections
             4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
-            5. Ensure search text matches EXACTLY - partial matches will fail
-            6. Keep changes minimal and focused
-            7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            5. Keep changes minimal and focused
+            6. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
 
             JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
             #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb
index e03dfef34..034bd44b2 100644
--- a/lib/utils/diff_utils/simple_diff.rb
+++ b/lib/utils/diff_utils/simple_diff.rb
@@ -26,6 +26,8 @@ def apply(content, search, replace)
           lines = content.split("\n")
           search_lines = search.split("\n")
 
+          ### TODO implement me
+
           # 1. Try exact matching
           match_positions =
             find_matches(lines, search_lines) { |line, search_line| line == search_line }
@@ -38,7 +40,17 @@ def apply(content, search, replace)
               end
           end
 
-          # 3. Try fuzzy matching
+          # 3. Try non-contiguous line based stripped matching
+          if match_positions.empty?
+            if range = non_contiguous_match_range(lines, search_lines)
+              first_match, last_match = range
+              lines.slice!(first_match, last_match - first_match + 1)
+              lines.insert(first_match, *replace.split("\n"))
+              return lines.join("\n")
+            end
+          end
+
+          # 4. Try fuzzy matching
           if match_positions.empty?
             match_positions =
               find_matches(lines, search_lines) do |line, search_line|
@@ -46,7 +58,7 @@ def apply(content, search, replace)
               end
           end
 
-          # 4. Try block matching as last resort
+          # 5. Try block matching as last resort
           if match_positions.empty?
             if block_matches = find_block_matches(content, search)
               return replace_blocks(content, block_matches, replace)
@@ -68,6 +80,23 @@ def apply(content, search, replace)
 
         private
 
+        def non_contiguous_match_range(lines, search_lines)
+          first_idx = nil
+          last_idx = nil
+          search_index = 0
+
+          lines.each_with_index do |line, idx|
+            if line.strip == search_lines[search_index].strip
+              first_idx ||= idx
+              last_idx = idx
+              search_index += 1
+              return first_idx, last_idx if search_index == search_lines.length
+            end
+          end
+
+          nil
+        end
+
         def find_matches(lines, search_lines)
           matches = []
           max_index = lines.length - search_lines.length
diff --git a/spec/lib/utils/diff_utils/simple_diff_spec.rb b/spec/lib/utils/diff_utils/simple_diff_spec.rb
index 8a29028f2..eaf4a2e91 100644
--- a/spec/lib/utils/diff_utils/simple_diff_spec.rb
+++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb
@@ -171,5 +171,31 @@
 
       expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
     end
+
+    it "handles missing lines in search" do
+      original = <<~TEXT
+        line1
+         line2
+        line3
+        line4
+        line5
+        line1
+        line2
+      TEXT
+
+      search = <<~TEXT
+        line1
+         line3
+        line1
+      TEXT
+
+      replace = ""
+
+      expected = <<~TEXT
+        line2
+      TEXT
+
+      expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
+    end
   end
 end

From 9fafaa4171752ea63392da21320d96e597c1cb55 Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Tue, 18 Feb 2025 14:33:29 +1100
Subject: [PATCH 2/9] try to add ... for elided lines, it appears to work well

---
 evals/lib/eval.rb                             | 44 +++++++++++++++++--
 lib/ai_bot/artifact_update_strategies/diff.rb | 32 ++++++++++++++
 lib/utils/diff_utils/simple_diff.rb           |  1 +
 spec/lib/utils/diff_utils/simple_diff_spec.rb |  2 +
 4 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 05b90a15b..efbb5dcac 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -28,9 +28,11 @@ def initialize(path:)
     @expected_tool_call = @yaml[:expected_tool_call]
     @expected_tool_call.symbolize_keys! if @expected_tool_call
 
-    @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
-      :path,
-    )
+    @args.each do |key, value|
+      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
+        @args[key] = File.expand_path(File.join(File.dirname(path), value))
+      end
+    end
   end
 
   def run(llm:)
@@ -44,6 +46,8 @@ def run(llm:)
         image_to_text(llm, **args)
       when "prompt"
         prompt_call(llm, **args)
+      when "edit_artifact"
+        edit_artifact(llm, **args)
       end
 
     if expected_output
@@ -53,7 +57,7 @@ def run(llm:)
         { result: :fail, expected_output: expected_output, actual_output: result }
       end
     elsif expected_output_regex
-      if result.match?(expected_output_regex)
+      if result.to_s.match?(expected_output_regex)
         { result: :pass }
       else
         { result: :fail, expected_output: expected_output_regex, actual_output: result }
@@ -169,4 +173,36 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
     end
     result
   end
+
+  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
+    css = File.read(css_path)
+    js = File.read(js_path)
+    html = File.read(html_path)
+    instructions = File.read(instructions_path)
+    artifact =
+      AiArtifact.create!(
+        css: css,
+        js: js,
+        html: html,
+        user_id: Discourse.system_user.id,
+        post_id: 1,
+        name: "eval artifact",
+      )
+
+    post = Post.new(topic_id: 1, id: 1)
+    DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
+      llm: llm.llm_model.to_llm,
+      post: post,
+      user: Discourse.system_user,
+      artifact: artifact,
+      artifact_version: nil,
+      instructions: instructions,
+    ).apply
+
+    version = artifact.versions.last
+    output = { css: version.css, js: version.js, html: version.html }
+
+    artifact.destroy
+    output
+  end
 end
diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb
index 0919cc164..884c2f589 100644
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@@ -110,6 +110,8 @@ def system_prompt
             5. Keep changes minimal and focused
             6. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
             7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
+            8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
+            9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
 
             JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
             #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
@@ -154,6 +156,36 @@ def system_prompt
             .text { font-size: 16px; }
             >>>>>>> REPLACE
             [/CSS]
+
+            Example - Non contiguous search in CSS (replace all CSS with new CSS)
+
+            Original CSS:
+            [CSS]
+            body {
+              color: red;
+            }
+            .button {
+              color: blue;
+            }
+            .alert {
+              background-color: green;
+            }
+            [/CSS]
+
+            [CSS]
+            <<<<<<< SEARCH
+            body {
+            ...
+              background-color: green;
+            }
+            =======
+            body {
+              color: red;
+            }
+            >>>>>>> REPLACE
+
+            This will replace the entire CSS block with the new CSS block, given that the search block is non-contiguous and unambiguous.
+
           PROMPT
         end
 
diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb
index 034bd44b2..88421a1a0 100644
--- a/lib/utils/diff_utils/simple_diff.rb
+++ b/lib/utils/diff_utils/simple_diff.rb
@@ -86,6 +86,7 @@ def non_contiguous_match_range(lines, search_lines)
           search_index = 0
 
           lines.each_with_index do |line, idx|
+            search_index += 1 if search_lines[search_index].strip == "..."
             if line.strip == search_lines[search_index].strip
               first_idx ||= idx
               last_idx = idx
diff --git a/spec/lib/utils/diff_utils/simple_diff_spec.rb b/spec/lib/utils/diff_utils/simple_diff_spec.rb
index eaf4a2e91..fe54e20b0 100644
--- a/spec/lib/utils/diff_utils/simple_diff_spec.rb
+++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb
@@ -185,7 +185,9 @@
 
       search = <<~TEXT
         line1
+        ...
          line3
+        ...
         line1
       TEXT
 

From 3b752d0c24c6de08da48d4d21c4a2008328ed8a3 Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Tue, 18 Feb 2025 15:18:14 +1100
Subject: [PATCH 3/9] some attempts at improving reliability

---
 evals/lib/eval.rb                             | 35 ++++++++++++++-----
 lib/ai_bot/artifact_update_strategies/diff.rb | 27 ++++++++++++--
 lib/utils/diff_utils/simple_diff.rb           |  5 ++-
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index efbb5dcac..3b069fd8e 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -12,6 +12,15 @@ class DiscourseAi::Evals::Eval
               :expected_output_regex,
               :expected_tool_call
 
+  class EvalError < StandardError
+    attr_reader :context
+
+    def initialize(message, context)
+      super(message)
+      @context = context
+    end
+  end
+
   def initialize(path:)
     @yaml = YAML.load_file(path).symbolize_keys
     @path = path
@@ -78,6 +87,8 @@ def run(llm:)
     else
       { result: :unknown, actual_output: result }
     end
+  rescue EvalError
+    { result: :fail }
   end
 
   def print
@@ -190,14 +201,22 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
       )
 
     post = Post.new(topic_id: 1, id: 1)
-    DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
-      llm: llm.llm_model.to_llm,
-      post: post,
-      user: Discourse.system_user,
-      artifact: artifact,
-      artifact_version: nil,
-      instructions: instructions,
-    ).apply
+    diff =
+      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
+        llm: llm.llm_model.to_llm,
+        post: post,
+        user: Discourse.system_user,
+        artifact: artifact,
+        artifact_version: nil,
+        instructions: instructions,
+      )
+    diff.apply
+
+    if diff.failed_searches.present?
+      puts "Eval Errors encountered"
+      p diff.failed_searches
+      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
+    end
 
     version = artifact.versions.last
     output = { css: version.css, js: version.js, html: version.html }
diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb
index 884c2f589..750dbec30 100644
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@@ -3,8 +3,15 @@ module DiscourseAi
   module AiBot
     module ArtifactUpdateStrategies
       class Diff < Base
+        attr_reader :failed_searches
+
         private
 
+        def initialize(**kwargs)
+          super
+          @failed_searches = []
+        end
+
         def build_prompt
           DiscourseAi::Completions::Prompt.new(
             system_prompt,
@@ -58,8 +65,10 @@ def apply_changes(changes)
                     block[:replace],
                   )
               rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
+                @failed_searches << { section: section, search: block[:search] }
                 # TODO, we may need to inform caller here, LLM made a mistake which it
                 # should correct
+                puts "Failed to find search: #{block[:search]}"
               end
             end
             updated_content[section == :javascript ? :js : section] = content
@@ -112,6 +121,7 @@ def system_prompt
             7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
             8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
             9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
+            10. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
 
             JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
             #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
@@ -157,9 +167,10 @@ def system_prompt
             >>>>>>> REPLACE
             [/CSS]
 
-            Example - Non contiguous search in CSS (replace all CSS with new CSS)
+            Example - Non contiguous search in CSS (replace most CSS with new CSS)
 
             Original CSS:
+
             [CSS]
             body {
               color: red;
@@ -170,6 +181,9 @@ def system_prompt
             .alert {
               background-color: green;
             }
+            .alert2 {
+              background-color: green;
+            }
             [/CSS]
 
             [CSS]
@@ -184,7 +198,16 @@ def system_prompt
             }
             >>>>>>> REPLACE
 
-            This will replace the entire CSS block with the new CSS block, given that the search block is non-contiguous and unambiguous.
+            RESULT:
+
+            [CSS]
+            body {
+              color: red;
+            }
+            .alert2 {
+              background-color: green;
+            }
+            [/CSS]
 
           PROMPT
         end
diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb
index 88421a1a0..202ea881c 100644
--- a/lib/utils/diff_utils/simple_diff.rb
+++ b/lib/utils/diff_utils/simple_diff.rb
@@ -86,7 +86,10 @@ def non_contiguous_match_range(lines, search_lines)
           search_index = 0
 
           lines.each_with_index do |line, idx|
-            search_index += 1 if search_lines[search_index].strip == "..."
+            if search_lines[search_index].strip == "..."
+              search_index += 1
+              break if search_lines[search_index].nil?
+            end
             if line.strip == search_lines[search_index].strip
               first_idx ||= idx
               last_idx = idx

From 72559c76ab6563635b8deced5954b8db5ba21ccf Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 11:52:05 +1100
Subject: [PATCH 4/9] testing and prompt engineering

---
 app/models/ai_artifact_version.rb             |  13 ++
 lib/ai_bot/artifact_update_strategies/diff.rb |  57 +++--
 spec/fabricators/ai_artifact_fabricator.rb    |  19 ++
 .../artifact_update_strategies/diff_spec.rb   | 214 ++++++++++++++++++
 4 files changed, 287 insertions(+), 16 deletions(-)
 create mode 100644 spec/fabricators/ai_artifact_fabricator.rb
 create mode 100644 spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb

diff --git a/app/models/ai_artifact_version.rb b/app/models/ai_artifact_version.rb
index 94589a8f6..9a7d2b142 100644
--- a/app/models/ai_artifact_version.rb
+++ b/app/models/ai_artifact_version.rb
@@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
   validates :html, length: { maximum: 65_535 }
   validates :css, length: { maximum: 65_535 }
   validates :js, length: { maximum: 65_535 }
+
+  # used when generating test cases
+  def write_to(path)
+    css_path = "#{path}/main.css"
+    html_path = "#{path}/main.html"
+    js_path = "#{path}/main.js"
+    instructions_path = "#{path}/instructions.txt"
+
+    File.write(css_path, css)
+    File.write(html_path, html)
+    File.write(js_path, js)
+    File.write(instructions_path, change_description)
+  end
 end
 
 # == Schema Information
diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb
index 750dbec30..4a0ff28d2 100644
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@@ -58,12 +58,16 @@ def apply_changes(changes)
             content = source.public_send(section == :javascript ? :js : section)
             blocks.each do |block|
               begin
-                content =
-                  DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
-                    content,
-                    block[:search],
-                    block[:replace],
-                  )
+                if !block[:search]
+                  content = block[:replace]
+                else
+                  content =
+                    DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
+                      content,
+                      block[:search],
+                      block[:replace],
+                    )
+                end
               rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
                 @failed_searches << { section: section, search: block[:search] }
                 # TODO, we may need to inform caller here, LLM made a mistake which it
@@ -85,7 +89,8 @@ def apply_changes(changes)
         private
 
         def extract_search_replace_blocks(content)
-          return nil if content.blank?
+          return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
+          return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
 
           blocks = []
           remaining = content
@@ -116,25 +121,26 @@ def system_prompt
             2. DO NOT modify the markers or add spaces around them
             3. DO NOT add explanations or comments within sections
             4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
-            5. Keep changes minimal and focused
-            6. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
-            7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
-            8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
-            9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
-            10. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
+            5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
+            7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
+            8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
+            9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
+            10. Never mix a full section replacement with a search/replace block in the same section
+            11. ALWAYS skip sections you to not want to change, do not include them in the response
 
             JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
             #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
 
             Reply Format:
             [HTML]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire HTML)
             [/HTML]
             [CSS]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire CSS)
             [/CSS]
             [JavaScript]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire JavaScript)
             [/JavaScript]
 
             Example - Multiple changes in one file:
@@ -209,6 +215,25 @@ def system_prompt
             }
             [/CSS]
 
+            Example - full HTML replacement:
+
+            [HTML]
+            <div>something old</div>
+            <div>another somethin old</div>
+            [/HTML]
+
+            output:
+
+            [HTML]
+            <div>something new</div>
+            [/HTML]
+
+            result:
+            [HTML]
+            <div>something new</div>
+            [/HTML]
+
+
           PROMPT
         end
 
diff --git a/spec/fabricators/ai_artifact_fabricator.rb b/spec/fabricators/ai_artifact_fabricator.rb
new file mode 100644
index 000000000..49ce9ee28
--- /dev/null
+++ b/spec/fabricators/ai_artifact_fabricator.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+Fabricator(:ai_artifact) do
+  user
+  post
+  name { sequence(:name) { |i| "artifact_#{i}" } }
+  html { "<div>Test Content</div>" }
+  css { ".test { color: blue; }" }
+  js { "console.log('test');" }
+  metadata { { public: false } }
+end
+
+Fabricator(:ai_artifact_version) do
+  ai_artifact
+  version_number { sequence(:version_number) { |i| i } }
+  html { "<div>Version Content</div>" }
+  css { ".version { color: red; }" }
+  js { "console.log('version');" }
+  change_description { "Test change" }
+end
diff --git a/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
new file mode 100644
index 000000000..c2426ad43
--- /dev/null
+++ b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
@@ -0,0 +1,214 @@
+# frozen_string_literal: true
+
+RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
+  fab!(:user)
+  fab!(:post)
+  fab!(:artifact) { Fabricate(:ai_artifact) }
+  fab!(:llm_model)
+
+  let(:llm) { llm_model.to_llm }
+  let(:instructions) { "Update the button color to red" }
+
+  let(:strategy) do
+    described_class.new(
+      llm: llm,
+      post: post,
+      user: user,
+      artifact: artifact,
+      artifact_version: nil,
+      instructions: instructions,
+    )
+  end
+
+  describe "#apply" do
+    it "processes simple search/replace blocks" do
+      original_css = ".button { color: blue; }"
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: blue; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.css).to eq(".button { color: red; }")
+    end
+
+    it "handles multiple search/replace blocks in the same section" do
+      original_css = <<~CSS
+        .button { color: blue; }
+        .text { font-size: 12px; }
+      CSS
+
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: blue; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        <<<<<<< SEARCH
+        .text { font-size: 12px; }
+        =======
+        .text { font-size: 16px; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expected = <<~CSS.strip
+        .button { color: red; }
+        .text { font-size: 16px; }
+      CSS
+
+      expect(artifact.versions.last.css.strip).to eq(expected.strip)
+    end
+
+    it "handles non-contiguous search/replace using ..." do
+      original_css = <<~CSS
+        body {
+          color: red;
+        }
+        .button {
+          color: blue;
+        }
+        .alert {
+          background-color: green;
+        }
+      CSS
+
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        body {
+        ...
+        background-color: green;
+        }
+        =======
+        body {
+          color: red;
+        }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.css).to eq("body {\n  color: red;\n}")
+    end
+
+    it "tracks failed searches" do
+      original_css = ".button { color: blue; }"
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: green; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(strategy.failed_searches).to contain_exactly(
+        { section: :css, search: ".button { color: green; }" },
+      )
+      expect(artifact.versions.last.css).to eq(original_css)
+    end
+
+    it "handles complete section replacements" do
+      original_html = "<div>old content</div>"
+      artifact.update!(html: original_html)
+
+      response = <<~RESPONSE
+        [HTML]
+        <div>new content</div>
+        [/HTML]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
+    end
+
+    it "ignores empty or 'no changes' sections part 1" do
+      original = {
+        html: "<div>content</div>",
+        css: ".button { color: blue; }",
+        js: "console.log('test');",
+      }
+
+      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
+
+      response = <<~RESPONSE
+        [HTML]
+        no changes
+        [/HTML]
+        [CSS]
+        (NO CHANGES)
+        [/CSS]
+        [JavaScript]
+        <<<<<<< SEARCH
+        console.log('test');
+        =======
+        console.log('(no changes)');
+        >>>>>>> REPLACE
+        [/JavaScript]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      version = artifact.versions.last
+      expect(version.html).to eq(original[:html])
+      expect(version.css).to eq(original[:css])
+      expect(version.js).to eq("console.log('(no changes)');")
+    end
+
+    it "ignores empty or 'no changes' section part 2" do
+      original = {
+        html: "<div>content</div>",
+        css: ".button { color: blue; }",
+        js: "console.log('test');",
+      }
+
+      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
+
+      response = <<~RESPONSE
+        [HTML]
+        (no changes)
+        [/HTML]
+        [CSS]
+
+        [/CSS]
+        [JavaScript]
+        <<<<<<< SEARCH
+        console.log('test');
+        =======
+        console.log('updated');
+        >>>>>>> REPLACE
+        [/JavaScript]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      version = artifact.versions.last
+      expect(version.html).to eq(original[:html])
+      expect(version.css).to eq(original[:css])
+      expect(version.js).to eq("console.log('updated');")
+    end
+  end
+end

From 7e72a32ca0f7ff47261b175919cf844d05e96813 Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 13:43:01 +1100
Subject: [PATCH 5/9] improve eval output

---
 evals/lib/eval.rb   | 35 ++++++++++++++++++++++++++++++++---
 evals/lib/runner.rb | 11 +++++++++--
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 3b069fd8e..0aa79c5b2 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -85,10 +85,10 @@ def run(llm:)
         { result: :pass }
       end
     else
-      { result: :unknown, actual_output: result }
+      { result: :pass }
     end
-  rescue EvalError
-    { result: :fail }
+  rescue EvalError => e
+    { result: :fail, message: e.message, context: e.context }
   end
 
   def print
@@ -218,10 +218,39 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
       raise EvalError.new("Failed to apply all changes", diff.failed_searches)
     end
 
+    raise EvalError.new("Invalid JS", artifact.js) if !valid_javascript?(artifact.js)
+
     version = artifact.versions.last
     output = { css: version.css, js: version.js, html: version.html }
 
     artifact.destroy
     output
   end
+
+  def valid_javascript?(str)
+    require "open3"
+
+    # Create a temporary file with the JavaScript code
+    Tempfile.create(%w[test .js]) do |f|
+      f.write(str)
+      f.flush
+
+      File.write("/tmp/test.js", str)
+
+      begin
+        Discourse::Utils.execute_command(
+          "node",
+          "--check",
+          f.path,
+          failure_message: "Invalid JavaScript syntax",
+          timeout: 30, # reasonable timeout in seconds
+        )
+        true
+      rescue Discourse::Utils::CommandError
+        false
+      end
+    end
+  rescue StandardError
+    false
+  end
 end
diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb
index 87cc153e3..c155d2741 100644
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@@ -155,9 +155,16 @@ def run!
 
           if result[:result] == :fail
             puts "Failed 🔴"
-            puts "---- Expected ----\n#{result[:expected_output]}"
-            puts "---- Actual ----\n#{result[:actual_output]}"
+            puts "Error: #{result[:message]}" if result[:message]
+            if result[:expected_output] && result[:actual_output]
+              puts "---- Expected ----\n#{result[:expected_output]}"
+              puts "---- Actual ----\n#{result[:actual_output]}"
+            end
             logger.error("Evaluation failed with LLM: #{llm.name}")
+            logger.error("Error: #{result[:message]}") if result[:message]
+            logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
+            logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
+            logger.error("Context: #{result[:context]}") if result[:context]
           elsif result[:result] == :pass
             puts "Passed 🟢"
             logger.info("Evaluation passed with LLM: #{llm.name}")

From 6fe881149c71363734870e415b8a7481ad0aa460 Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 13:49:51 +1100
Subject: [PATCH 6/9] move error handling

---
 evals/lib/eval.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 0aa79c5b2..26ec5d73b 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -218,9 +218,9 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
       raise EvalError.new("Failed to apply all changes", diff.failed_searches)
     end
 
-    raise EvalError.new("Invalid JS", artifact.js) if !valid_javascript?(artifact.js)
-
     version = artifact.versions.last
+    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
+
     output = { css: version.css, js: version.js, html: version.html }
 
     artifact.destroy

From 6b3a57a9c6e3848fb1fbb040e8fb043ce18b91ec Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 14:29:34 +1100
Subject: [PATCH 7/9] llm as a judge

---
 evals/lib/eval.rb   | 65 ++++++++++++++++++++++++++++++++++++++++++---
 evals/lib/runner.rb |  2 ++
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 26ec5d73b..280ab8713 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -10,7 +10,8 @@ class DiscourseAi::Evals::Eval
               :vision,
               :expected_output,
               :expected_output_regex,
-              :expected_tool_call
+              :expected_tool_call,
+              :judge
 
   class EvalError < StandardError
     attr_reader :context
@@ -36,6 +37,8 @@ def initialize(path:)
       Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
     @expected_tool_call = @yaml[:expected_tool_call]
     @expected_tool_call.symbolize_keys! if @expected_tool_call
+    @judge = @yaml[:judge]
+    @judge.symbolize_keys! if @judge
 
     @args.each do |key, value|
       if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
@@ -84,6 +87,8 @@ def run(llm:)
       else
         { result: :pass }
       end
+    elsif judge
+      judge_result(result)
     else
       { result: :pass }
     end
@@ -111,14 +116,68 @@ def to_json
 
   private
 
-  def helper(llm, input:, name:)
+  def judge_result(result)
+    prompt = judge[:prompt].dup
+    prompt.sub!("{{output}}", result)
+    prompt.sub!("{{input}}", args[:input])
+
+    prompt += <<~SUFFIX
+
+      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
+
+      example output:
+
+      [RATING]10[/RATING] perfect output
+
+      example output:
+
+      [RATING]5[/RATING]
+
+      the following failed to preserve... etc...
+    SUFFIX
+
+    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
+
+    DiscourseAi::Completions::Prompt.new(
+      "You are an expert judge tasked at testing LLM outputs.",
+      messages: [{ type: :user, content: prompt }],
+    )
+
+    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+
+    if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
+      rating = rating[1].to_i
+    end
+
+    if rating.to_i >= judge[:pass_rating]
+      { result: :pass }
+    else
+      {
+        result: :fail,
+        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
+        context: result,
+      }
+    end
+  end
+
+  def helper(llm, input:, name:, locale: nil)
     completion_prompt = CompletionPrompt.find_by(name: name)
     helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
+    user = Discourse.system_user
+    if locale
+      user = User.new
+      class << user
+        attr_accessor :effective_locale
+      end
+
+      user.effective_locale = locale
+      user.admin = true
+    end
     result =
       helper.generate_and_send_prompt(
         completion_prompt,
         input,
-        current_user = Discourse.system_user,
+        current_user = user,
         _force_default_locale = false,
       )
 
diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb
index c155d2741..86fa34b27 100644
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@@ -156,6 +156,8 @@ def run!
           if result[:result] == :fail
             puts "Failed 🔴"
             puts "Error: #{result[:message]}" if result[:message]
+            # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+            #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
             if result[:expected_output] && result[:actual_output]
               puts "---- Expected ----\n#{result[:expected_output]}"
               puts "---- Actual ----\n#{result[:actual_output]}"

From 6e8620b4cef2ba55836d7db0b275f4461303a64d Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 14:46:36 +1100
Subject: [PATCH 8/9] fix spec

---
 spec/lib/modules/ai_bot/tools/update_artifact_spec.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
index 17833a0bc..c9834d251 100644
--- a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
+++ b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
@@ -262,7 +262,7 @@
     [/CSS]
 
     [JavaScript]
-    nothing to do
+    no changes
     [/JavaScript]
 
     LLMs like to say nonsense that we can ignore here as well

From 3a7085b3d9bb305f1f2e88a374b93b9a5df0d33e Mon Sep 17 00:00:00 2001
From: Sam Saffron <sam.saffron@gmail.com>
Date: Wed, 19 Feb 2025 14:52:11 +1100
Subject: [PATCH 9/9] small note on evals

---
 README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/README.md b/README.md
index cd7ae38ad..6bbc5a8bf 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,26 @@
 **Plugin Summary**
 
 For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
+
+### Evals
+
+The directory `evals` contains AI evals for the Discourse AI plugin.
+
+To run them use: 
+
+cd evals
+./run --help
+
+```
+Usage: evals/run [options]
+    -e, --eval NAME                  Name of the evaluation to run
+        --list-models                List models
+    -m, --model NAME                 Model to evaluate (will eval all models if not specified)
+    -l, --list                       List evals
+```
+
+To run evals you will need to configure API keys in your environment:
+
+OPENAI_API_KEY=your_openai_api_key
+ANTHROPIC_API_KEY=your_anthropic_api_key
+GEMINI_API_KEY=your_gemini_api_key