From 8e2ffaed23d40b22d4d7d48cdc53f0c22232c615 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Mon, 17 Feb 2025 19:42:22 +1100 Subject: [PATCH 1/9] WIP: smarter edit algorithm --- lib/ai_bot/artifact_update_strategies/diff.rb | 10 +++--- lib/utils/diff_utils/simple_diff.rb | 33 +++++++++++++++++-- spec/lib/utils/diff_utils/simple_diff_spec.rb | 26 +++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb index fc2c23459..0919cc164 100644 --- a/lib/ai_bot/artifact_update_strategies/diff.rb +++ b/lib/ai_bot/artifact_update_strategies/diff.rb @@ -98,16 +98,18 @@ def system_prompt 1. Use EXACTLY this format for changes: <<<<<<< SEARCH - (exact code to find) + (first line of code to replace) + (other lines of code to avoid ambiguity) + (last line of code to replace) ======= (replacement code) >>>>>>> REPLACE 2. DO NOT modify the markers or add spaces around them 3. DO NOT add explanations or comments within sections 4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes - 5. Ensure search text matches EXACTLY - partial matches will fail - 6. Keep changes minimal and focused - 7. HTML should not include , , or tags, it is injected into a template + 5. Keep changes minimal and focused + 6. HTML should not include , , or tags, it is injected into a template + 7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb index e03dfef34..034bd44b2 100644 --- a/lib/utils/diff_utils/simple_diff.rb +++ b/lib/utils/diff_utils/simple_diff.rb @@ -26,6 +26,8 @@ def apply(content, search, replace) lines = content.split("\n") search_lines = search.split("\n") + ### TODO implement me + # 1. Try exact matching match_positions = find_matches(lines, search_lines) { |line, search_line| line == search_line } @@ -38,7 +40,17 @@ def apply(content, search, replace) end end - # 3. Try fuzzy matching + # 3. Try non-contiguous line based stripped matching + if match_positions.empty? + if range = non_contiguous_match_range(lines, search_lines) + first_match, last_match = range + lines.slice!(first_match, last_match - first_match + 1) + lines.insert(first_match, *replace.split("\n")) + return lines.join("\n") + end + end + + # 4. Try fuzzy matching if match_positions.empty? match_positions = find_matches(lines, search_lines) do |line, search_line| @@ -46,7 +58,7 @@ def apply(content, search, replace) end end - # 4. Try block matching as last resort + # 5. Try block matching as last resort if match_positions.empty? if block_matches = find_block_matches(content, search) return replace_blocks(content, block_matches, replace) @@ -68,6 +80,23 @@ def apply(content, search, replace) private + def non_contiguous_match_range(lines, search_lines) + first_idx = nil + last_idx = nil + search_index = 0 + + lines.each_with_index do |line, idx| + if line.strip == search_lines[search_index].strip + first_idx ||= idx + last_idx = idx + search_index += 1 + return first_idx, last_idx if search_index == search_lines.length + end + end + + nil + end + def find_matches(lines, search_lines) matches = [] max_index = lines.length - search_lines.length diff --git a/spec/lib/utils/diff_utils/simple_diff_spec.rb b/spec/lib/utils/diff_utils/simple_diff_spec.rb index 8a29028f2..eaf4a2e91 100644 --- a/spec/lib/utils/diff_utils/simple_diff_spec.rb +++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb @@ -171,5 +171,31 @@ expect(subject.apply(content, search, replace).strip).to eq(expected.strip) end + + it "handles missing lines in search" do + original = <<~TEXT + line1 + line2 + line3 + line4 + line5 + line1 + line2 + TEXT + + search = <<~TEXT + line1 + line3 + line1 + TEXT + + replace = "" + + expected = <<~TEXT + line2 + TEXT + + expect(subject.apply(original, search, replace).strip).to eq(expected.strip) + end end end From 9fafaa4171752ea63392da21320d96e597c1cb55 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Tue, 18 Feb 2025 14:33:29 +1100 Subject: [PATCH 2/9] try to add ... for elided lines, it appears to work well --- evals/lib/eval.rb | 44 +++++++++++++++++-- lib/ai_bot/artifact_update_strategies/diff.rb | 32 ++++++++++++++ lib/utils/diff_utils/simple_diff.rb | 1 + spec/lib/utils/diff_utils/simple_diff_spec.rb | 2 + 4 files changed, 75 insertions(+), 4 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 05b90a15b..efbb5dcac 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -28,9 +28,11 @@ def initialize(path:) @expected_tool_call = @yaml[:expected_tool_call] @expected_tool_call.symbolize_keys! if @expected_tool_call - @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?( - :path, - ) + @args.each do |key, value| + if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) + @args[key] = File.expand_path(File.join(File.dirname(path), value)) + end + end end def run(llm:) @@ -44,6 +46,8 @@ def run(llm:) image_to_text(llm, **args) when "prompt" prompt_call(llm, **args) + when "edit_artifact" + edit_artifact(llm, **args) end if expected_output @@ -53,7 +57,7 @@ def run(llm:) { result: :fail, expected_output: expected_output, actual_output: result } end elsif expected_output_regex - if result.match?(expected_output_regex) + if result.to_s.match?(expected_output_regex) { result: :pass } else { result: :fail, expected_output: expected_output_regex, actual_output: result } @@ -169,4 +173,36 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false) end result end + + def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:) + css = File.read(css_path) + js = File.read(js_path) + html = File.read(html_path) + instructions = File.read(instructions_path) + artifact = + AiArtifact.create!( + css: css, + js: js, + html: html, + user_id: Discourse.system_user.id, + post_id: 1, + name: "eval artifact", + ) + + post = Post.new(topic_id: 1, id: 1) + DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new( + llm: llm.llm_model.to_llm, + post: post, + user: Discourse.system_user, + artifact: artifact, + artifact_version: nil, + instructions: instructions, + ).apply + + version = artifact.versions.last + output = { css: version.css, js: version.js, html: version.html } + + artifact.destroy + output + end end diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb index 0919cc164..884c2f589 100644 --- a/lib/ai_bot/artifact_update_strategies/diff.rb +++ b/lib/ai_bot/artifact_update_strategies/diff.rb @@ -110,6 +110,8 @@ def system_prompt 5. Keep changes minimal and focused 6. HTML should not include , , or tags, it is injected into a template 7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit + 8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop + 9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} @@ -154,6 +156,36 @@ def system_prompt .text { font-size: 16px; } >>>>>>> REPLACE [/CSS] + + Example - Non contiguous search in CSS (replace all CSS with new CSS) + + Original CSS: + [CSS] + body { + color: red; + } + .button { + color: blue; + } + .alert { + background-color: green; + } + [/CSS] + + [CSS] + <<<<<<< SEARCH + body { + ... + background-color: green; + } + ======= + body { + color: red; + } + >>>>>>> REPLACE + + This will replace the entire CSS block with the new CSS block, given that the search block is non-contiguous and unambiguous. + PROMPT end diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb index 034bd44b2..88421a1a0 100644 --- a/lib/utils/diff_utils/simple_diff.rb +++ b/lib/utils/diff_utils/simple_diff.rb @@ -86,6 +86,7 @@ def non_contiguous_match_range(lines, search_lines) search_index = 0 lines.each_with_index do |line, idx| + search_index += 1 if search_lines[search_index].strip == "..." if line.strip == search_lines[search_index].strip first_idx ||= idx last_idx = idx diff --git a/spec/lib/utils/diff_utils/simple_diff_spec.rb b/spec/lib/utils/diff_utils/simple_diff_spec.rb index eaf4a2e91..fe54e20b0 100644 --- a/spec/lib/utils/diff_utils/simple_diff_spec.rb +++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb @@ -185,7 +185,9 @@ search = <<~TEXT line1 + ... line3 + ... line1 TEXT From 3b752d0c24c6de08da48d4d21c4a2008328ed8a3 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Tue, 18 Feb 2025 15:18:14 +1100 Subject: [PATCH 3/9] some attempts at improving reliability --- evals/lib/eval.rb | 35 ++++++++++++++----- lib/ai_bot/artifact_update_strategies/diff.rb | 27 ++++++++++++-- lib/utils/diff_utils/simple_diff.rb | 5 ++- 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index efbb5dcac..3b069fd8e 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -12,6 +12,15 @@ class DiscourseAi::Evals::Eval :expected_output_regex, :expected_tool_call + class EvalError < StandardError + attr_reader :context + + def initialize(message, context) + super(message) + @context = context + end + end + def initialize(path:) @yaml = YAML.load_file(path).symbolize_keys @path = path @@ -78,6 +87,8 @@ def run(llm:) else { result: :unknown, actual_output: result } end + rescue EvalError + { result: :fail } end def print @@ -190,14 +201,22 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:) ) post = Post.new(topic_id: 1, id: 1) - DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new( - llm: llm.llm_model.to_llm, - post: post, - user: Discourse.system_user, - artifact: artifact, - artifact_version: nil, - instructions: instructions, - ).apply + diff = + DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new( + llm: llm.llm_model.to_llm, + post: post, + user: Discourse.system_user, + artifact: artifact, + artifact_version: nil, + instructions: instructions, + ) + diff.apply + + if diff.failed_searches.present? + puts "Eval Errors encountered" + p diff.failed_searches + raise EvalError.new("Failed to apply all changes", diff.failed_searches) + end version = artifact.versions.last output = { css: version.css, js: version.js, html: version.html } diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb index 884c2f589..750dbec30 100644 --- a/lib/ai_bot/artifact_update_strategies/diff.rb +++ b/lib/ai_bot/artifact_update_strategies/diff.rb @@ -3,8 +3,15 @@ module DiscourseAi module AiBot module ArtifactUpdateStrategies class Diff < Base + attr_reader :failed_searches + private + def initialize(**kwargs) + super + @failed_searches = [] + end + def build_prompt DiscourseAi::Completions::Prompt.new( system_prompt, @@ -58,8 +65,10 @@ def apply_changes(changes) block[:replace], ) rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError + @failed_searches << { section: section, search: block[:search] } # TODO, we may need to inform caller here, LLM made a mistake which it # should correct + puts "Failed to find search: #{block[:search]}" end end updated_content[section == :javascript ? :js : section] = content @@ -112,6 +121,7 @@ def system_prompt 7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit 8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop 9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines + 10. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} @@ -157,9 +167,10 @@ def system_prompt >>>>>>> REPLACE [/CSS] - Example - Non contiguous search in CSS (replace all CSS with new CSS) + Example - Non contiguous search in CSS (replace most CSS with new CSS) Original CSS: + [CSS] body { color: red; @@ -170,6 +181,9 @@ def system_prompt .alert { background-color: green; } + .alert2 { + background-color: green; + } [/CSS] [CSS] @@ -184,7 +198,16 @@ def system_prompt } >>>>>>> REPLACE - This will replace the entire CSS block with the new CSS block, given that the search block is non-contiguous and unambiguous. + RESULT: + + [CSS] + body { + color: red; + } + .alert2 { + background-color: green; + } + [/CSS] PROMPT end diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb index 88421a1a0..202ea881c 100644 --- a/lib/utils/diff_utils/simple_diff.rb +++ b/lib/utils/diff_utils/simple_diff.rb @@ -86,7 +86,10 @@ def non_contiguous_match_range(lines, search_lines) search_index = 0 lines.each_with_index do |line, idx| - search_index += 1 if search_lines[search_index].strip == "..." + if search_lines[search_index].strip == "..." + search_index += 1 + break if search_lines[search_index].nil? + end if line.strip == search_lines[search_index].strip first_idx ||= idx last_idx = idx From 72559c76ab6563635b8deced5954b8db5ba21ccf Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 11:52:05 +1100 Subject: [PATCH 4/9] testing and prompt engineering --- app/models/ai_artifact_version.rb | 13 ++ lib/ai_bot/artifact_update_strategies/diff.rb | 57 +++-- spec/fabricators/ai_artifact_fabricator.rb | 19 ++ .../artifact_update_strategies/diff_spec.rb | 214 ++++++++++++++++++ 4 files changed, 287 insertions(+), 16 deletions(-) create mode 100644 spec/fabricators/ai_artifact_fabricator.rb create mode 100644 spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb diff --git a/app/models/ai_artifact_version.rb b/app/models/ai_artifact_version.rb index 94589a8f6..9a7d2b142 100644 --- a/app/models/ai_artifact_version.rb +++ b/app/models/ai_artifact_version.rb @@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base validates :html, length: { maximum: 65_535 } validates :css, length: { maximum: 65_535 } validates :js, length: { maximum: 65_535 } + + # used when generating test cases + def write_to(path) + css_path = "#{path}/main.css" + html_path = "#{path}/main.html" + js_path = "#{path}/main.js" + instructions_path = "#{path}/instructions.txt" + + File.write(css_path, css) + File.write(html_path, html) + File.write(js_path, js) + File.write(instructions_path, change_description) + end end # == Schema Information diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb index 750dbec30..4a0ff28d2 100644 --- a/lib/ai_bot/artifact_update_strategies/diff.rb +++ b/lib/ai_bot/artifact_update_strategies/diff.rb @@ -58,12 +58,16 @@ def apply_changes(changes) content = source.public_send(section == :javascript ? :js : section) blocks.each do |block| begin - content = - DiscourseAi::Utils::DiffUtils::SimpleDiff.apply( - content, - block[:search], - block[:replace], - ) + if !block[:search] + content = block[:replace] + else + content = + DiscourseAi::Utils::DiffUtils::SimpleDiff.apply( + content, + block[:search], + block[:replace], + ) + end rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError @failed_searches << { section: section, search: block[:search] } # TODO, we may need to inform caller here, LLM made a mistake which it @@ -85,7 +89,8 @@ def apply_changes(changes) private def extract_search_replace_blocks(content) - return nil if content.blank? + return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m) + return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/) blocks = [] remaining = content @@ -116,25 +121,26 @@ def system_prompt 2. DO NOT modify the markers or add spaces around them 3. DO NOT add explanations or comments within sections 4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes - 5. Keep changes minimal and focused - 6. HTML should not include , , or tags, it is injected into a template - 7. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit - 8. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop - 9. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines - 10. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block + 5. HTML should not include , , or tags, it is injected into a template + 6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit + 7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop + 8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines + 9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block + 10. Never mix a full section replacement with a search/replace block in the same section + 11. ALWAYS skip sections you to not want to change, do not include them in the response JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} Reply Format: [HTML] - (changes or empty if no changes) + (changes or empty if no changes or entire HTML) [/HTML] [CSS] - (changes or empty if no changes) + (changes or empty if no changes or entire CSS) [/CSS] [JavaScript] - (changes or empty if no changes) + (changes or empty if no changes or entire JavaScript) [/JavaScript] Example - Multiple changes in one file: @@ -209,6 +215,25 @@ def system_prompt } [/CSS] + Example - full HTML replacement: + + [HTML] +
something old
+
another somethin old
+ [/HTML] + + output: + + [HTML] +
something new
+ [/HTML] + + result: + [HTML] +
something new
+ [/HTML] + + PROMPT end diff --git a/spec/fabricators/ai_artifact_fabricator.rb b/spec/fabricators/ai_artifact_fabricator.rb new file mode 100644 index 000000000..49ce9ee28 --- /dev/null +++ b/spec/fabricators/ai_artifact_fabricator.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true +Fabricator(:ai_artifact) do + user + post + name { sequence(:name) { |i| "artifact_#{i}" } } + html { "
Test Content
" } + css { ".test { color: blue; }" } + js { "console.log('test');" } + metadata { { public: false } } +end + +Fabricator(:ai_artifact_version) do + ai_artifact + version_number { sequence(:version_number) { |i| i } } + html { "
Version Content
" } + css { ".version { color: red; }" } + js { "console.log('version');" } + change_description { "Test change" } +end diff --git a/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb new file mode 100644 index 000000000..c2426ad43 --- /dev/null +++ b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb @@ -0,0 +1,214 @@ +# frozen_string_literal: true + +RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do + fab!(:user) + fab!(:post) + fab!(:artifact) { Fabricate(:ai_artifact) } + fab!(:llm_model) + + let(:llm) { llm_model.to_llm } + let(:instructions) { "Update the button color to red" } + + let(:strategy) do + described_class.new( + llm: llm, + post: post, + user: user, + artifact: artifact, + artifact_version: nil, + instructions: instructions, + ) + end + + describe "#apply" do + it "processes simple search/replace blocks" do + original_css = ".button { color: blue; }" + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: blue; } + ======= + .button { color: red; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.css).to eq(".button { color: red; }") + end + + it "handles multiple search/replace blocks in the same section" do + original_css = <<~CSS + .button { color: blue; } + .text { font-size: 12px; } + CSS + + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: blue; } + ======= + .button { color: red; } + >>>>>>> REPLACE + <<<<<<< SEARCH + .text { font-size: 12px; } + ======= + .text { font-size: 16px; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expected = <<~CSS.strip + .button { color: red; } + .text { font-size: 16px; } + CSS + + expect(artifact.versions.last.css.strip).to eq(expected.strip) + end + + it "handles non-contiguous search/replace using ..." do + original_css = <<~CSS + body { + color: red; + } + .button { + color: blue; + } + .alert { + background-color: green; + } + CSS + + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + body { + ... + background-color: green; + } + ======= + body { + color: red; + } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.css).to eq("body {\n color: red;\n}") + end + + it "tracks failed searches" do + original_css = ".button { color: blue; }" + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: green; } + ======= + .button { color: red; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(strategy.failed_searches).to contain_exactly( + { section: :css, search: ".button { color: green; }" }, + ) + expect(artifact.versions.last.css).to eq(original_css) + end + + it "handles complete section replacements" do + original_html = "
old content
" + artifact.update!(html: original_html) + + response = <<~RESPONSE + [HTML] +
new content
+ [/HTML] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.html.strip).to eq("
new content
") + end + + it "ignores empty or 'no changes' sections part 1" do + original = { + html: "
content
", + css: ".button { color: blue; }", + js: "console.log('test');", + } + + artifact.update!(html: original[:html], css: original[:css], js: original[:js]) + + response = <<~RESPONSE + [HTML] + no changes + [/HTML] + [CSS] + (NO CHANGES) + [/CSS] + [JavaScript] + <<<<<<< SEARCH + console.log('test'); + ======= + console.log('(no changes)'); + >>>>>>> REPLACE + [/JavaScript] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + version = artifact.versions.last + expect(version.html).to eq(original[:html]) + expect(version.css).to eq(original[:css]) + expect(version.js).to eq("console.log('(no changes)');") + end + + it "ignores empty or 'no changes' section part 2" do + original = { + html: "
content
", + css: ".button { color: blue; }", + js: "console.log('test');", + } + + artifact.update!(html: original[:html], css: original[:css], js: original[:js]) + + response = <<~RESPONSE + [HTML] + (no changes) + [/HTML] + [CSS] + + [/CSS] + [JavaScript] + <<<<<<< SEARCH + console.log('test'); + ======= + console.log('updated'); + >>>>>>> REPLACE + [/JavaScript] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + version = artifact.versions.last + expect(version.html).to eq(original[:html]) + expect(version.css).to eq(original[:css]) + expect(version.js).to eq("console.log('updated');") + end + end +end From 7e72a32ca0f7ff47261b175919cf844d05e96813 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 13:43:01 +1100 Subject: [PATCH 5/9] improve eval output --- evals/lib/eval.rb | 35 ++++++++++++++++++++++++++++++++--- evals/lib/runner.rb | 11 +++++++++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 3b069fd8e..0aa79c5b2 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -85,10 +85,10 @@ def run(llm:) { result: :pass } end else - { result: :unknown, actual_output: result } + { result: :pass } end - rescue EvalError - { result: :fail } + rescue EvalError => e + { result: :fail, message: e.message, context: e.context } end def print @@ -218,10 +218,39 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:) raise EvalError.new("Failed to apply all changes", diff.failed_searches) end + raise EvalError.new("Invalid JS", artifact.js) if !valid_javascript?(artifact.js) + version = artifact.versions.last output = { css: version.css, js: version.js, html: version.html } artifact.destroy output end + + def valid_javascript?(str) + require "open3" + + # Create a temporary file with the JavaScript code + Tempfile.create(%w[test .js]) do |f| + f.write(str) + f.flush + + File.write("/tmp/test.js", str) + + begin + Discourse::Utils.execute_command( + "node", + "--check", + f.path, + failure_message: "Invalid JavaScript syntax", + timeout: 30, # reasonable timeout in seconds + ) + true + rescue Discourse::Utils::CommandError + false + end + end + rescue StandardError + false + end end diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb index 87cc153e3..c155d2741 100644 --- a/evals/lib/runner.rb +++ b/evals/lib/runner.rb @@ -155,9 +155,16 @@ def run! if result[:result] == :fail puts "Failed 🔴" - puts "---- Expected ----\n#{result[:expected_output]}" - puts "---- Actual ----\n#{result[:actual_output]}" + puts "Error: #{result[:message]}" if result[:message] + if result[:expected_output] && result[:actual_output] + puts "---- Expected ----\n#{result[:expected_output]}" + puts "---- Actual ----\n#{result[:actual_output]}" + end logger.error("Evaluation failed with LLM: #{llm.name}") + logger.error("Error: #{result[:message]}") if result[:message] + logger.error("Expected: #{result[:expected_output]}") if result[:expected_output] + logger.error("Actual: #{result[:actual_output]}") if result[:actual_output] + logger.error("Context: #{result[:context]}") if result[:context] elsif result[:result] == :pass puts "Passed 🟢" logger.info("Evaluation passed with LLM: #{llm.name}") From 6fe881149c71363734870e415b8a7481ad0aa460 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 13:49:51 +1100 Subject: [PATCH 6/9] move error handling --- evals/lib/eval.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 0aa79c5b2..26ec5d73b 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -218,9 +218,9 @@ def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:) raise EvalError.new("Failed to apply all changes", diff.failed_searches) end - raise EvalError.new("Invalid JS", artifact.js) if !valid_javascript?(artifact.js) - version = artifact.versions.last + raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js) + output = { css: version.css, js: version.js, html: version.html } artifact.destroy From 6b3a57a9c6e3848fb1fbb040e8fb043ce18b91ec Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 14:29:34 +1100 Subject: [PATCH 7/9] llm as a judge --- evals/lib/eval.rb | 65 ++++++++++++++++++++++++++++++++++++++++++--- evals/lib/runner.rb | 2 ++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 26ec5d73b..280ab8713 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -10,7 +10,8 @@ class DiscourseAi::Evals::Eval :vision, :expected_output, :expected_output_regex, - :expected_tool_call + :expected_tool_call, + :judge class EvalError < StandardError attr_reader :context @@ -36,6 +37,8 @@ def initialize(path:) Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex @expected_tool_call = @yaml[:expected_tool_call] @expected_tool_call.symbolize_keys! if @expected_tool_call + @judge = @yaml[:judge] + @judge.symbolize_keys! if @judge @args.each do |key, value| if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) @@ -84,6 +87,8 @@ def run(llm:) else { result: :pass } end + elsif judge + judge_result(result) else { result: :pass } end @@ -111,14 +116,68 @@ def to_json private - def helper(llm, input:, name:) + def judge_result(result) + prompt = judge[:prompt].dup + prompt.sub!("{{output}}", result) + prompt.sub!("{{input}}", args[:input]) + + prompt += <<~SUFFIX + + Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible. + + example output: + + [RATING]10[/RATING] perfect output + + example output: + + [RATING]5[/RATING] + + the following failed to preserve... etc... + SUFFIX + + judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first + + DiscourseAi::Completions::Prompt.new( + "You are an expert judge tasked at testing LLM outputs.", + messages: [{ type: :user, content: prompt }], + ) + + result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user) + + if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]}) + rating = rating[1].to_i + end + + if rating.to_i >= judge[:pass_rating] + { result: :pass } + else + { + result: :fail, + message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}", + context: result, + } + end + end + + def helper(llm, input:, name:, locale: nil) completion_prompt = CompletionPrompt.find_by(name: name) helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy) + user = Discourse.system_user + if locale + user = User.new + class << user + attr_accessor :effective_locale + end + + user.effective_locale = locale + user.admin = true + end result = helper.generate_and_send_prompt( completion_prompt, input, - current_user = Discourse.system_user, + current_user = user, _force_default_locale = false, ) diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb index c155d2741..86fa34b27 100644 --- a/evals/lib/runner.rb +++ b/evals/lib/runner.rb @@ -156,6 +156,8 @@ def run! if result[:result] == :fail puts "Failed 🔴" puts "Error: #{result[:message]}" if result[:message] + # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful + #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context] if result[:expected_output] && result[:actual_output] puts "---- Expected ----\n#{result[:expected_output]}" puts "---- Actual ----\n#{result[:actual_output]}" From 6e8620b4cef2ba55836d7db0b275f4461303a64d Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 14:46:36 +1100 Subject: [PATCH 8/9] fix spec --- spec/lib/modules/ai_bot/tools/update_artifact_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb index 17833a0bc..c9834d251 100644 --- a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb +++ b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb @@ -262,7 +262,7 @@ [/CSS] [JavaScript] - nothing to do + no changes [/JavaScript] LLMs like to say nonsense that we can ignore here as well From 3a7085b3d9bb305f1f2e88a374b93b9a5df0d33e Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Wed, 19 Feb 2025 14:52:11 +1100 Subject: [PATCH 9/9] small note on evals --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index cd7ae38ad..6bbc5a8bf 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,26 @@ **Plugin Summary** For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco + +### Evals + +The directory `evals` contains AI evals for the Discourse AI plugin. + +To run them use: + +cd evals +./run --help + +``` +Usage: evals/run [options] + -e, --eval NAME Name of the evaluation to run + --list-models List models + -m, --model NAME Model to evaluate (will eval all models if not specified) + -l, --list List evals +``` + +To run evals you will need to configure API keys in your environment: + +OPENAI_API_KEY=your_openai_api_key +ANTHROPIC_API_KEY=your_anthropic_api_key +GEMINI_API_KEY=your_gemini_api_key