@@ -10,7 +10,8 @@ class DiscourseAi::Evals::Eval
1010 :vision ,
1111 :expected_output ,
1212 :expected_output_regex ,
13- :expected_tool_call
13+ :expected_tool_call ,
14+ :judge
1415
1516 class EvalError < StandardError
1617 attr_reader :context
@@ -36,6 +37,8 @@ def initialize(path:)
3637 Regexp . new ( @expected_output_regex , Regexp ::MULTILINE ) if @expected_output_regex
3738 @expected_tool_call = @yaml [ :expected_tool_call ]
3839 @expected_tool_call . symbolize_keys! if @expected_tool_call
40+ @judge = @yaml [ :judge ]
41+ @judge . symbolize_keys! if @judge
3942
4043 @args . each do |key , value |
4144 if ( key . to_s . include? ( "_path" ) || key . to_s == "path" ) && value . is_a? ( String )
@@ -84,6 +87,8 @@ def run(llm:)
8487 else
8588 { result : :pass }
8689 end
90+ elsif judge
91+ judge_result ( result )
8792 else
8893 { result : :pass }
8994 end
@@ -111,14 +116,68 @@ def to_json
111116
112117 private
113118
114- def helper ( llm , input :, name :)
# Grades an eval output by asking a second ("judge") LLM to rate it.
#
# Reads its configuration from `judge`:
#   :prompt      - template text; `{{output}}` and `{{input}}` are filled in
#   :llm         - value handed to DiscourseAi::Evals::Llm.choose
#   :pass_rating - minimum rating (inclusive) needed to pass
#
# @param result [String] the output produced by the eval under test
# @return [Hash] `{ result: :pass }`, or
#   `{ result: :fail, message: String, context: <raw judge reply> }`
def judge_result(result)
  user_text = judge_prompt_text(result)
  judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

  # Keep a reference to the structured prompt and send *it*, so the judge
  # system message actually accompanies the request.
  prompt =
    DiscourseAi::Completions::Prompt.new(
      "You are an expert judge tasked at testing LLM outputs.",
      messages: [{ type: :user, content: user_text }],
    )

  judge_response = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)

  # nil when the reply carries no [RATING]n[/RATING] tag at all.
  rating = judge_response.to_s[%r{\[RATING\](\d+)\[/RATING\]}, 1]&.to_i

  if rating.nil?
    # A reply we cannot parse is reported as such instead of being scored as 0.
    {
      result: :fail,
      message: "LLM judge did not return a rating, expecting #{judge[:pass_rating]}",
      context: judge_response,
    }
  elsif rating >= judge[:pass_rating]
    { result: :pass }
  else
    {
      result: :fail,
      message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
      context: judge_response,
    }
  end
end

# Fills the judge prompt template and appends the rating-format instructions.
#
# Placeholders are replaced in a single pass using a Hash replacement, so:
#   * backslashes in the substituted text are kept verbatim (a String
#     replacement would treat `\1`, `\0`, ... as back-references), and
#   * placeholder-looking text inside the substituted values is not expanded.
#
# @param output [#to_s] the eval output to be judged
# @return [String] the user message for the judge LLM
def judge_prompt_text(output)
  replacements = { "{{output}}" => output.to_s, "{{input}}" => args[:input].to_s }
  filled = judge[:prompt].gsub(/\{\{(?:output|input)\}\}/, replacements)

  filled + <<~SUFFIX

    Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

    example output:

    [RATING]10[/RATING] perfect output

    example output:

    [RATING]5[/RATING]

    the following failed to preserve... etc...
  SUFFIX
end
162+
163+ def helper ( llm , input :, name :, locale : nil )
115164 completion_prompt = CompletionPrompt . find_by ( name : name )
116165 helper = DiscourseAi ::AiHelper ::Assistant . new ( helper_llm : llm . llm_proxy )
166+ user = Discourse . system_user
167+ if locale
168+ user = User . new
169+ class << user
170+ attr_accessor :effective_locale
171+ end
172+
173+ user . effective_locale = locale
174+ user . admin = true
175+ end
117176 result =
118177 helper . generate_and_send_prompt (
119178 completion_prompt ,
120179 input ,
121- current_user = Discourse . system_user ,
180+ current_user = user ,
122181 _force_default_locale = false ,
123182 )
124183
0 commit comments