
Commit e4603b5 (parent 95572ac)
work in progress, eval

5 files changed: +223 −3 lines
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
id: simple_proofread
name: Simple Proofread
description: A simple proofread evaluation
type: helper
args:
  name: proofread
  input: "Haw are you doing todayy?"
expected_output: "How are you doing today?"
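
A note on how this case is consumed: the runner added below loads it with YAML.load_file plus a shallow symbolize_keys, so it arrives as roughly the following hash, whose type, args, and expected_output keys map onto Llm#eval (a sketch; the inner args keys stay strings until the runner symbolizes them separately):

{
  id: "simple_proofread",
  name: "Simple Proofread",
  description: "A simple proofread evaluation",
  type: "helper",
  args: { "name" => "proofread", "input" => "Haw are you doing todayy?" },
  expected_output: "How are you doing today?",
}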

evals/lib/llm.rb

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
# frozen_string_literal: true

module DiscourseAi::Evals
end

class DiscourseAi::Evals::Llm
  CONFIGS = {
    "gpt-4o" => {
      display_name: "GPT-4o",
      name: "gpt-4o",
      tokenizer: "DiscourseAi::Tokenizer::OpenAiTokenizer",
      api_key_env: "OPENAI_API_KEY",
      provider: "open_ai",
      url: "https://api.openai.com/v1/chat/completions",
      max_prompt_tokens: 131_072,
      vision_enabled: true,
    },
    "gpt-4o-mini" => {
      display_name: "GPT-4o-mini",
      name: "gpt-4o-mini",
      tokenizer: "DiscourseAi::Tokenizer::OpenAiTokenizer",
      api_key_env: "OPENAI_API_KEY",
      provider: "open_ai",
      url: "https://api.openai.com/v1/chat/completions",
      max_prompt_tokens: 131_072,
      vision_enabled: true,
    },
    "claude-3.5-haiku" => {
      display_name: "Claude 3.5 Haiku",
      name: "claude-3-5-haiku-latest",
      tokenizer: "DiscourseAi::Tokenizer::AnthropicTokenizer",
      api_key_env: "ANTHROPIC_API_KEY",
      provider: "anthropic",
      url: "https://api.anthropic.com/v1/messages",
      max_prompt_tokens: 200_000,
      vision_enabled: false,
    },
    "claude-3.5-sonnet" => {
      display_name: "Claude 3.5 Sonnet",
      name: "claude-3-5-sonnet-latest",
      tokenizer: "DiscourseAi::Tokenizer::AnthropicTokenizer",
      api_key_env: "ANTHROPIC_API_KEY",
      provider: "anthropic",
      url: "https://api.anthropic.com/v1/messages",
      max_prompt_tokens: 200_000,
      vision_enabled: true,
    },
"gemini-2.0-flash" => {
49+
display_name: "Gemini 2.0 Flash",
50+
name: "gemini-2-0-flash",
51+
tokenizer: "DiscourseAi::Tokenizer::GeminiTokenizer",
52+
api_key_env: "GEMINI_API_KEY",
53+
provider: "google",
54+
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash",
55+
max_prompt_tokens: 1_000_000,
56+
vision_enabled: true,
57+
},
58+
}

  def self.choose(config_name)
    if config_name.nil?
      # no model given: fan out to every configured model
      CONFIGS.keys.map { |name| new(name) }
    elsif !CONFIGS.include?(config_name)
      # unknown model: empty list, so the caller can report the bad name
      []
    else
      [new(config_name)]
    end
  end

  attr_reader :llm_model
  attr_reader :llm

  def initialize(config_name)
    config = CONFIGS[config_name].dup
    api_key_env = config.delete(:api_key_env)
    if !ENV[api_key_env]
      raise "Missing API key for #{config_name}, should be set via #{api_key_env}"
    end

    config[:api_key] = ENV[api_key_env]
    @llm_model = LlmModel.new(config)
    @llm = DiscourseAi::Completions::Llm.proxy(@llm_model)
  end

  def eval(type:, args:, expected_output: nil)
    result =
      case type
      when "helper"
        helper(**args)
      end

    if expected_output && result == expected_output
      { result: :pass }
    else
      { result: :fail, expected_output: expected_output, actual_output: result }
    end
  end

  def name
    @llm_model.display_name
  end

  private

  def helper(input:, name:)
    completion_prompt = CompletionPrompt.find_by(name: name)
    helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: @llm)
    result =
      helper.generate_and_send_prompt(
        completion_prompt,
        input,
        # the assignments below only label the positional arguments
        current_user = Discourse.system_user,
        _force_default_locale = false,
      )

    result[:suggestions].first
  end
end
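
For orientation, here is how a wrapper and a case meet end to end; a minimal sketch, assuming a booted Discourse environment with the discourse-ai plugin loaded and OPENAI_API_KEY set:

llm = DiscourseAi::Evals::Llm.choose("gpt-4o").first
result =
  llm.eval(
    type: "helper",
    args: { name: "proofread", input: "Haw are you doing todayy?" },
    expected_output: "How are you doing today?",
  )
puts "#{llm.name}: #{result[:result]}" # "GPT-4o: pass" when the model returns the exact fix

Calling choose(nil) instead fans out to every configured model, which is what the runner below does when no --model flag is passed; in that case every API key environment variable must be set, since initialization raises on a missing one.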

evals/run

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

discourse_path = File.expand_path(File.join(File.dirname(__FILE__), "../../.."))
# rubocop:disable Discourse/NoChdir
Dir.chdir(discourse_path)
# rubocop:enable Discourse/NoChdir

require File.join(discourse_path, "config/environment")
require_relative "lib/llm"

# Set up command line argument parsing
require "optparse"
ENV["DISCOURSE_AI_NO_DEBUG"] = "1"

options = { eval_name: nil, model: nil, output_dir: File.join(discourse_path, "tmp", "evals") }

OptionParser
  .new do |opts|
    opts.banner = "Usage: evals/run [options]"

    opts.on("-e", "--eval NAME", "Name of the evaluation to run") do |eval_name|
      options[:eval_name] = eval_name
    end

    opts.on("-m", "--model NAME", "Model to evaluate") { |model| options[:model] = model }

    opts.on("-o", "--output-dir DIR", "Directory for evaluation results") do |dir|
      options[:output_dir] = dir
    end
  end
  .parse!

# Ensure output directory exists
FileUtils.mkdir_p(options[:output_dir])

# Load and run the specified evaluation
if options[:eval_name].nil?
  puts "Error: Must specify an evaluation name with -e or --eval"
  exit 1
end

cases_path = File.join(__dir__, "cases")

cases = Dir.glob(File.join(cases_path, "*/*.yml")).map { |f| [File.basename(f, ".yml"), f] }.to_h

if !cases.keys.include?(options[:eval_name])
  puts "Error: Unknown evaluation '#{options[:eval_name]}'"
  exit 1
end

llms = DiscourseAi::Evals::Llm.choose(options[:model])

if llms.empty?
  puts "Error: Unknown model '#{options[:model]}'"
  exit 1
end

eval_info = YAML.load_file(cases[options[:eval_name]]).symbolize_keys

puts "Running evaluation '#{options[:eval_name]}'"

llms.each do |llm|
  eval =
    llm.eval(
      type: eval_info[:type],
      args: eval_info[:args].symbolize_keys,
      expected_output: eval_info[:expected_output],
    )

  print "#{llm.name}: "
  if eval[:result] == :fail
    puts "Error: #{eval.inspect}"
  elsif eval[:result] == :pass
    puts "Passed 🟢"
  else
    STDERR.puts "Error: Unknown result #{eval.inspect}"
  end
end
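
Putting it together, a run from the plugin root looks like this (a sketch; the success line mirrors the prints above, and an exact-match miss prints the failure hash instead):

evals/run -e simple_proofread -m gpt-4o
# Running evaluation 'simple_proofread'
# GPT-4o: Passed 🟢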

lib/ai_helper/assistant.rb

Lines changed: 16 additions & 2 deletions
@@ -13,6 +13,20 @@ def self.clear_prompt_cache!
     prompt_cache.flush!
   end

+  def initialize(helper_llm: nil, image_caption_llm: nil)
+    @helper_llm = helper_llm
+    @image_caption_llm = image_caption_llm
+  end
+
+  def helper_llm
+    @helper_llm || DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model)
+  end
+
+  def image_caption_llm
+    @image_caption_llm ||
+      DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model)
+  end
+
   def available_prompts(user)
     key = "prompt_cache_#{I18n.locale}"
     self
@@ -115,7 +129,7 @@ def localize_prompt!(prompt, user = nil, force_default_locale = false)
   end

   def generate_prompt(completion_prompt, input, user, force_default_locale = false, &block)
-    llm = DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model)
+    llm = helper_llm
     prompt = completion_prompt.messages_with_input(input)
     localize_prompt!(prompt, user, force_default_locale)

@@ -182,7 +196,7 @@ def generate_image_caption(upload, user)
       )

     raw_caption =
-      DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
+      image_caption_llm.generate(
         prompt,
         user: user,
         max_tokens: 1024,
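
The constructor is the seam the eval harness uses; a sketch of both paths, with llm standing for a proxied model as built in evals/lib/llm.rb above:

# production path: no arguments, falls back to the ai_helper_model site setting
assistant = DiscourseAi::AiHelper::Assistant.new

# eval path: inject the model under test
assistant = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm)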

lib/completions/endpoints/base.rb

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ def perform_completion!(
         log.duration_msecs = (Time.now - start_time) * 1000
         log.save!
         LlmQuota.log_usage(@llm_model, user, log.request_tokens, log.response_tokens)
-        if Rails.env.development?
+        if Rails.env.development? && !ENV["DISCOURSE_AI_NO_DEBUG"]
           puts "#{self.class.name}: request_tokens #{log.request_tokens} response_tokens #{log.response_tokens}"
         end
       end
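
This guard is the other half of the DISCOURSE_AI_NO_DEBUG switch: evals/run sets the variable before issuing requests, so per-request token counts stay out of the eval output even in a development environment.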
