diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 693959ec0..4e84ff328 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -57,13 +57,51 @@ def run(llm:)
       when "image_to_text"
         image_to_text(llm, **args)
       when "prompt"
-        prompt_call(llm, **args)
+        DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(**args)
       when "edit_artifact"
         edit_artifact(llm, **args)
       when "summarization"
         summarization(llm, **args)
       end
 
+    classify_results(result)
+  rescue EvalError => e
+    { result: :fail, message: e.message, context: e.context }
+  end
+
+  def print
+    puts "#{id}: #{description}"
+  end
+
+  def to_json
+    {
+      type: @type,
+      path: @path,
+      name: @name,
+      description: @description,
+      id: @id,
+      args: @args,
+      vision: @vision,
+      expected_output: @expected_output,
+      expected_output_regex: @expected_output_regex,
+    }.compact
+  end
+
+  private
+
+  # @param result [String, Array] the result of the eval, either
+  # "llm response" or [{ result: "llm response", other_attrs: here }]
+  # @return [Array] an array of hashes with the result classified
+  # as pass or fail, along with extra attributes
+  def classify_results(result)
+    if result.is_a?(Array)
+      result.each { |r| r.merge!(classify_result_pass_fail(r)) }
+    else
+      [classify_result_pass_fail(result)]
+    end
+  end
+
+  def classify_result_pass_fail(result)
     if expected_output
       if result == expected_output
         { result: :pass }
@@ -94,34 +132,17 @@ def run(llm:)
     else
       { result: :pass }
     end
-  rescue EvalError => e
-    { result: :fail, message: e.message, context: e.context }
   end
 
-  def print
-    puts "#{id}: #{description}"
-  end
-
-  def to_json
-    {
-      type: @type,
-      path: @path,
-      name: @name,
-      description: @description,
-      id: @id,
-      args: @args,
-      vision: @vision,
-      expected_output: @expected_output,
-      expected_output_regex: @expected_output_regex,
-    }.compact
-  end
-
-  private
-
   def judge_result(result)
     prompt = judge[:prompt].dup
-    prompt.sub!("{{output}}", result)
-    args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    if result.is_a?(String)
+      prompt.sub!("{{output}}", result)
+      args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    else
+      prompt.sub!("{{output}}", result[:result])
+      result.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
+    end
 
     prompt += <<~SUFFIX
 
@@ -220,36 +241,6 @@ def pdf_to_text(llm, path:)
     upload.destroy if upload
   end
 
-  def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
-    if tools
-      tools.each do |tool|
-        tool.symbolize_keys!
-        tool[:parameters].symbolize_keys! if tool[:parameters]
-      end
-    end
-    prompt =
-      DiscourseAi::Completions::Prompt.new(
-        system_prompt,
-        messages: [{ type: :user, content: message }],
-      )
-
-    prompt.tools = tools if tools
-
-    result = nil
-    if stream
-      result = []
-      llm
-        .llm_model
-        .to_llm
-        .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
-          result << partial
-        end
-    else
-      result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
-    end
-    result
-  end
-
   def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
     css = File.read(css_path)
     js = File.read(js_path)
diff --git a/evals/lib/prompt_evaluator.rb b/evals/lib/prompt_evaluator.rb
new file mode 100644
index 000000000..d6ca3330c
--- /dev/null
+++ b/evals/lib/prompt_evaluator.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+class DiscourseAi::Evals::PromptEvaluator
+  def initialize(llm)
+    @llm = llm.llm_model.to_llm
+  end
+
+  def prompt_call(prompts:, messages: nil, temperature: nil, tools: nil, stream: false)
+    tools = symbolize_tools(tools)
+    total = prompts.size * messages.size
+    count = 0
+    puts ""
+
+    prompts.flat_map do |prompt|
+      messages.map do |content|
+        count += 1
+        print "\rProcessing #{count}/#{total}"
+
+        c_prompt =
+          DiscourseAi::Completions::Prompt.new(prompt, messages: [{ type: :user, content: }])
+        c_prompt.tools = tools if tools
+        result = { prompt:, message: content }
+        result[:result] = generate_result(c_prompt, temperature, stream)
+        result
+      end
+    end
+  ensure
+    print "\r\033[K"
+  end
+
+  private
+
+  def generate_result(c_prompt, temperature, stream)
+    if stream
+      stream_result = []
+      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature) do |partial|
+        stream_result << partial
+      end
+      stream_result
+    else
+      @llm.generate(c_prompt, user: Discourse.system_user, temperature: temperature)
+    end
+  end
+
+  def symbolize_tools(tools)
+    return nil if tools.nil?
+
+    tools.map do |tool|
+      { name: tool["name"], parameters: tool["parameters"]&.transform_keys(&:to_sym) }.compact
+    end
+  end
+end
diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb
index 86fa34b27..e72fa1e0f 100644
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@@ -148,31 +148,33 @@ def run!
       structured_logger.step("Evaluating with LLM: #{llm.name}") do |step|
         logger.info("Evaluating with LLM: #{llm.name}")
         print "#{llm.name}: "
-        result = @eval.run(llm: llm)
-
-        step[:args] = result
-        step[:cname] = result[:result] == :pass ? :good : :bad
-
-        if result[:result] == :fail
-          puts "Failed 🔴"
-          puts "Error: #{result[:message]}" if result[:message]
-          # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
-          #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
-          if result[:expected_output] && result[:actual_output]
-            puts "---- Expected ----\n#{result[:expected_output]}"
-            puts "---- Actual ----\n#{result[:actual_output]}"
+        results = @eval.run(llm: llm)
+
+        results.each do |result|
+          step[:args] = result
+          step[:cname] = result[:result] == :pass ? :good : :bad
+
+          if result[:result] == :fail
+            puts "Failed 🔴"
+            puts "Error: #{result[:message]}" if result[:message]
+            # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+            #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
+            if result[:expected_output] && result[:actual_output]
+              puts "---- Expected ----\n#{result[:expected_output]}"
+              puts "---- Actual ----\n#{result[:actual_output]}"
+            end
+            logger.error("Evaluation failed with LLM: #{llm.name}")
+            logger.error("Error: #{result[:message]}") if result[:message]
+            logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
+            logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
+            logger.error("Context: #{result[:context]}") if result[:context]
+          elsif result[:result] == :pass
+            puts "Passed 🟢"
+            logger.info("Evaluation passed with LLM: #{llm.name}")
+          else
+            STDERR.puts "Error: Unknown result #{eval.inspect}"
+            logger.error("Unknown result: #{eval.inspect}")
           end
-          logger.error("Evaluation failed with LLM: #{llm.name}")
-          logger.error("Error: #{result[:message]}") if result[:message]
-          logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
-          logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
-          logger.error("Context: #{result[:context]}") if result[:context]
-        elsif result[:result] == :pass
-          puts "Passed 🟢"
-          logger.info("Evaluation passed with LLM: #{llm.name}")
-        else
-          STDERR.puts "Error: Unknown result #{eval.inspect}"
-          logger.error("Unknown result: #{eval.inspect}")
-        end
         end
       end
     end
diff --git a/evals/run b/evals/run
index a4a396976..8c133eb22 100755
--- a/evals/run
+++ b/evals/run
@@ -6,6 +6,7 @@ require_relative "lib/llm"
 require_relative "lib/cli"
 require_relative "lib/runner"
 require_relative "lib/eval"
+require_relative "lib/prompt_evaluator"
 
 options = DiscourseAi::Evals::Cli.parse_options!
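
For reference, a minimal usage sketch of the extracted evaluator (the prompt and message strings below are hypothetical; `llm` is the same wrapper object that Eval#run receives). Each prompt is crossed with each message, so this call issues two completions and returns two result hashes:

  # illustrative only -- exercises DiscourseAi::Evals::PromptEvaluator from the patch above
  evaluator = DiscourseAi::Evals::PromptEvaluator.new(llm)
  results =
    evaluator.prompt_call(
      prompts: ["You answer in one word.", "You answer in one sentence."], # hypothetical
      messages: ["What does this plugin evaluate?"],                       # hypothetical
      temperature: 0,
    )
  # => [{ prompt: "...", message: "...", result: "llm response" }, ...]
  # Eval#classify_results then merges a :pass/:fail classification into each
  # hash, and the runner prints one Passed/Failed line per entry.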