@@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
1010 :vision ,
1111 :expected_output ,
1212 :expected_output_regex ,
13- :expected_tool_call
13+ :expected_tool_call ,
14+ :judge
15+
# Raised when an eval cannot produce a usable result.
#
# Besides the usual message it carries a +context+ payload describing what
# went wrong, so the rescuer can report it next to the failure.
class EvalError < StandardError
  # @return [Object, nil] extra failure details supplied by the raiser
  attr_reader :context

  # @param message [String] human readable failure description
  # @param context [Object, nil] optional failure details; defaults to nil so
  #   the standard `raise EvalError, "message"` form keeps working
  def initialize(message, context = nil)
    super(message)
    @context = context
  end
end
1424
1525 def initialize ( path :)
1626 @yaml = YAML . load_file ( path ) . symbolize_keys
@@ -27,10 +37,14 @@ def initialize(path:)
2737 Regexp . new ( @expected_output_regex , Regexp ::MULTILINE ) if @expected_output_regex
2838 @expected_tool_call = @yaml [ :expected_tool_call ]
2939 @expected_tool_call . symbolize_keys! if @expected_tool_call
40+ @judge = @yaml [ :judge ]
41+ @judge . symbolize_keys! if @judge
3042
31- @args [ :path ] = File . expand_path ( File . join ( File . dirname ( path ) , @args [ :path ] ) ) if @args &.key? (
32- :path ,
33- )
43+ @args . each do |key , value |
44+ if ( key . to_s . include? ( "_path" ) || key . to_s == "path" ) && value . is_a? ( String )
45+ @args [ key ] = File . expand_path ( File . join ( File . dirname ( path ) , value ) )
46+ end
47+ end
3448 end
3549
3650 def run ( llm :)
@@ -44,6 +58,8 @@ def run(llm:)
4458 image_to_text ( llm , **args )
4559 when "prompt"
4660 prompt_call ( llm , **args )
61+ when "edit_artifact"
62+ edit_artifact ( llm , **args )
4763 end
4864
4965 if expected_output
@@ -53,7 +69,7 @@ def run(llm:)
5369 { result : :fail , expected_output : expected_output , actual_output : result }
5470 end
5571 elsif expected_output_regex
56- if result . match? ( expected_output_regex )
72+ if result . to_s . match? ( expected_output_regex )
5773 { result : :pass }
5874 else
5975 { result : :fail , expected_output : expected_output_regex , actual_output : result }
@@ -71,9 +87,13 @@ def run(llm:)
7187 else
7288 { result : :pass }
7389 end
90+ elsif judge
91+ judge_result ( result )
7492 else
75- { result : :unknown , actual_output : result }
93+ { result : :pass }
7694 end
95+ rescue EvalError => e
96+ { result : :fail , message : e . message , context : e . context }
7797 end
7898
7999 def print
@@ -96,14 +116,68 @@ def to_json
96116
97117 private
98118
99- def helper ( llm , input :, name :)
# Grades an eval output by asking a second ("judge") LLM for a 1-10 rating.
#
# The judge prompt template comes from +judge[:prompt]+; the first
# "{{output}}" and "{{input}}" placeholders are replaced with +result+ and
# +args[:input]+. The judge model is looked up through +judge[:llm]+ and its
# reply is scanned for a "[RATING]n[/RATING]" tag.
#
# @param result [Object] output of the eval under test (converted with to_s)
# @return [Hash] +{ result: :pass }+ when the rating reaches
#   +judge[:pass_rating]+, otherwise a +:fail+ hash with :message and the
#   judge's raw reply under :context. A reply without a rating tag counts as 0.
def judge_result(result)
  # Block form of sub: the replacement is inserted verbatim (backslashes in
  # the LLM output are not treated as back-references), the template itself
  # is never mutated, and to_s tolerates Hash outputs / a missing input.
  template = judge[:prompt].sub("{{output}}") { result.to_s }
  template = template.sub("{{input}}") { args[:input].to_s }

  template += <<~SUFFIX

    Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.

    example output:

    [RATING]10[/RATING] perfect output

    example output:

    [RATING]5[/RATING]

    the following failed to preserve... etc...
  SUFFIX

  judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first

  # Keep the structured prompt: it is what carries the judge system message.
  judge_prompt =
    DiscourseAi::Completions::Prompt.new(
      "You are an expert judge tasked at testing LLM outputs.",
      messages: [{ type: :user, content: template }],
    )

  verdict = judge_llm.llm_model.to_llm.generate(judge_prompt, user: Discourse.system_user)

  # nil when the judge did not emit a rating tag
  rating = verdict[%r{\[RATING\](\d+)\[/RATING\]}, 1]&.to_i

  if rating.to_i >= judge[:pass_rating]
    { result: :pass }
  else
    {
      result: :fail,
      message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
      context: verdict,
    }
  end
end
162+
163+ def helper ( llm , input :, name :, locale : nil )
100164 completion_prompt = CompletionPrompt . find_by ( name : name )
101165 helper = DiscourseAi ::AiHelper ::Assistant . new ( helper_llm : llm . llm_proxy )
166+ user = Discourse . system_user
167+ if locale
168+ user = User . new
169+ class << user
170+ attr_accessor :effective_locale
171+ end
172+
173+ user . effective_locale = locale
174+ user . admin = true
175+ end
102176 result =
103177 helper . generate_and_send_prompt (
104178 completion_prompt ,
105179 input ,
106- current_user = Discourse . system_user ,
180+ current_user = user ,
107181 _force_default_locale = false ,
108182 )
109183
@@ -169,4 +243,73 @@ def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
169243 end
170244 result
171245 end
246+
# Runs the artifact "diff" update strategy against a throw-away artifact and
# returns the resulting sources.
#
# The css/js/html files seed a temporary AiArtifact, the instructions file is
# handed to DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff, and the last
# artifact version is read back after the strategy has been applied.
#
# @param llm [Object] eval LLM wrapper; +llm.llm_model.to_llm+ is used
# @param css_path [String] path of the initial CSS
# @param js_path [String] path of the initial JavaScript
# @param html_path [String] path of the initial HTML
# @param instructions_path [String] path of the edit instructions
# @return [Hash] +{ css:, js:, html: }+ taken from the last artifact version
# @raise [EvalError] when some searches could not be applied, or when the
#   resulting JavaScript does not pass valid_javascript?
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
  css = File.read(css_path)
  js = File.read(js_path)
  html = File.read(html_path)
  instructions = File.read(instructions_path)
  artifact =
    AiArtifact.create!(
      css: css,
      js: js,
      html: html,
      user_id: Discourse.system_user.id,
      post_id: 1,
      name: "eval artifact",
    )

  post = Post.new(topic_id: 1, id: 1)
  diff =
    DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
      llm: llm.llm_model.to_llm,
      post: post,
      user: Discourse.system_user,
      artifact: artifact,
      artifact_version: nil,
      instructions: instructions,
    )
  diff.apply

  if diff.failed_searches.present?
    puts "Eval Errors encountered"
    p diff.failed_searches
    raise EvalError.new("Failed to apply all changes", diff.failed_searches)
  end

  version = artifact.versions.last
  raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)

  { css: version.css, js: version.js, html: version.html }
ensure
  # Always remove the scratch artifact, including on the EvalError paths
  # above; artifact is still nil if reading the input files failed.
  artifact&.destroy
end
288+
# Checks whether +str+ is syntactically valid JavaScript by running
# `node --check` on a temporary copy of it.
#
# @param str [String] JavaScript source to check
# @return [Boolean] true when the command succeeds; false when it raises
#   Discourse::Utils::CommandError or when any other StandardError occurs
#   while preparing or running the check.
#   NOTE(review): a missing `node` binary presumably also ends up as false
#   here rather than as an error - confirm that is intended.
def valid_javascript?(str)
  require "open3"
  require "tempfile"

  # The temporary file is removed automatically when the block returns.
  Tempfile.create(%w[test .js]) do |f|
    f.write(str)
    f.flush # the external process reads the file from disk

    begin
      Discourse::Utils.execute_command(
        "node",
        "--check",
        f.path,
        failure_message: "Invalid JavaScript syntax",
        timeout: 30, # reasonable timeout in seconds
      )
      true
    rescue Discourse::Utils::CommandError
      false
    end
  end
rescue StandardError
  false
end
172315end
0 commit comments