@@ -57,13 +57,51 @@ def run(llm:)
5757 when "image_to_text"
5858 image_to_text ( llm , **args )
5959 when "prompt"
60- prompt_call ( llm , **args )
60+ DiscourseAi :: Evals :: PromptEvaluator . new ( llm ) . prompt_call ( **args )
6161 when "edit_artifact"
6262 edit_artifact ( llm , **args )
6363 when "summarization"
6464 summarization ( llm , **args )
6565 end
6666
67+ classify_results ( result )
68+ rescue EvalError => e
69+ { result : :fail , message : e . message , context : e . context }
70+ end
71+
# Prints a one-line, human-readable summary of this eval ("<id>: <description>")
# to stdout. NOTE(review): shadows Kernel#print for this object — presumably
# intentional for the CLI listing; confirm no caller relies on Kernel#print here.
def print
  puts [id, description].join(": ")
end
75+
# Serializable snapshot of this eval's configuration.
#
# NOTE(review): despite the name, this returns a Hash (with nil-valued entries
# removed via #compact), not a JSON string — callers are expected to encode it.
#
# @return [Hash] the non-nil attributes of this eval
def to_json
  attributes = {
    type: @type,
    path: @path,
    name: @name,
    description: @description,
    id: @id,
    args: @args,
    vision: @vision,
    expected_output: @expected_output,
    expected_output_regex: @expected_output_regex,
  }
  attributes.compact
end
89+
90+ private
91+
# Classifies an eval result (or a batch of them) as pass/fail.
#
# @param result [String, Array<Hash>] either a raw llm response string, or an
#   array of hashes shaped like { result: "llm response", other_attrs: here }
# @return [Array<Hash>] hashes carrying the pass/fail classification plus any
#   extra attributes; array inputs are mutated in place via #merge! and returned
def classify_results(result)
  return [classify_result_pass_fail(result)] unless result.is_a?(Array)

  result.each { |entry| entry.merge!(classify_result_pass_fail(entry)) }
end
103+
104+ def classify_result_pass_fail ( result )
67105 if expected_output
68106 if result == expected_output
69107 { result : :pass }
@@ -94,34 +132,17 @@ def run(llm:)
94132 else
95133 { result : :pass }
96134 end
97- rescue EvalError => e
98- { result : :fail , message : e . message , context : e . context }
99135 end
100136
101- def print
102- puts "#{ id } : #{ description } "
103- end
104-
105- def to_json
106- {
107- type : @type ,
108- path : @path ,
109- name : @name ,
110- description : @description ,
111- id : @id ,
112- args : @args ,
113- vision : @vision ,
114- expected_output : @expected_output ,
115- expected_output_regex : @expected_output_regex ,
116- } . compact
117- end
118-
119- private
120-
121137 def judge_result ( result )
122138 prompt = judge [ :prompt ] . dup
123- prompt . sub! ( "{{output}}" , result )
124- args . each { |key , value | prompt . sub! ( "{{#{ key } }}" , value . to_s ) }
139+ if result . is_a? ( String )
140+ prompt . sub! ( "{{output}}" , result )
141+ args . each { |key , value | prompt . sub! ( "{{#{ key } }}" , value . to_s ) }
142+ else
143+ prompt . sub! ( "{{output}}" , result [ :result ] )
144+ result . each { |key , value | prompt . sub! ( "{{#{ key } }}" , value . to_s ) }
145+ end
125146
126147 prompt += <<~SUFFIX
127148
@@ -220,36 +241,6 @@ def pdf_to_text(llm, path:)
220241 upload . destroy if upload
221242 end
222243
# Runs a single prompt-style eval against the given llm.
#
# @param llm [Object] eval llm wrapper exposing #llm_model for completion calls
# @param system_prompt [String] the system message for the completion
# @param message [String] the user message content
# @param temperature [Float, nil] optional sampling temperature
# @param tools [Array<Hash>, nil] optional tool definitions; top-level keys and
#   each tool's :parameters hash are symbolized in place
# @param stream [Boolean] when true, collects streamed partials into an Array
# @return [String, Array] the completion text, or an Array of streamed partials
def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
  if tools
    tools.each do |tool|
      tool.symbolize_keys!
      tool[:parameters].symbolize_keys! if tool[:parameters]
    end
  end

  prompt =
    DiscourseAi::Completions::Prompt.new(
      system_prompt,
      messages: [{ type: :user, content: message }],
    )
  prompt.tools = tools if tools

  completion_llm = llm.llm_model.to_llm

  if stream
    result = []
    completion_llm.generate(
      prompt,
      user: Discourse.system_user,
      temperature: temperature,
    ) { |partial| result << partial }
    result
  else
    # FIX: the non-streaming branch previously omitted temperature:, so
    # temperature-sensitive evals silently ran at the model's default.
    completion_llm.generate(prompt, user: Discourse.system_user, temperature: temperature)
  end
end
252-
253244 def edit_artifact ( llm , css_path :, js_path :, html_path :, instructions_path :)
254245 css = File . read ( css_path )
255246 js = File . read ( js_path )
0 commit comments