@@ -103,108 +103,191 @@ async def evaluate(
             MetricNotFoundError: If metric name not found
             ParseError: If unable to parse model response
         """
+        # Resolve metric if string
+        resolved_metric = self._resolve_metric(metric)
+
+        # Handle model-specific metrics early
+        if isinstance(resolved_metric, ModelSpecificMetric):
+            return await self._evaluate_model_specific_metric(
+                resolved_metric, content, sampling_params
+            )
+
+        # Process normal evaluation
+        evaluation_params = self._prepare_evaluation_params(
+            resolved_metric, criteria, rubric, scale, examples,
+            system_prompt, template_engine
+        )
+
+        # Process templates
+        processed_params = self._process_templates(
+            evaluation_params, template_vars, input, context
+        )
+
+        # Build and execute evaluation
+        return await self._execute_evaluation(
+            content, processed_params, sampling_params, **kwargs
+        )
+
+    def _resolve_metric(self, metric: Union[Metric, str, None]) -> Optional[Metric]:
+        """Resolve metric string to Metric object."""
         if metric and isinstance(metric, str):
-            metric: Metric = self.get_metric(metric)
-
-        # Handle model-specific metrics
-        if isinstance(metric, ModelSpecificMetric):
-            if isinstance(content, dict):
-                raise InvalidInputError("Model-specific metrics only support string and list of dicts as content for now")
-
-            if isinstance(content, list) and len(content) == 0:
-                raise InvalidInputError("Conversation content cannot be an empty list.")
-
-            is_conversation = (
-                isinstance(content, list) and
-                all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in content)
+            return self.get_metric(metric)
+        return metric
+
+    async def _evaluate_model_specific_metric(
+        self,
+        metric: ModelSpecificMetric,
+        content: Union[str, List[Dict[str, str]]],
+        sampling_params: Optional[Dict[str, Any]]
+    ) -> EvaluationResult:
+        """Handle evaluation for model-specific metrics."""
+        # Validate content for model-specific metrics
+        if isinstance(content, dict):
+            raise InvalidInputError(
+                "Model-specific metrics only support string and list of dicts as content for now"
             )
-            if isinstance(content, list) and not is_conversation:
-                raise InvalidInputError("Invalid content structure for conversation. Please provide a list of dicts with role and content fields.")
-
-
-            # Skip ALL our formatting
-            if is_conversation:
-                messages = content
-            else:
-                messages = [{"role": "user", "content": content}]
+
+        if isinstance(content, list) and len(content) == 0:
+            raise InvalidInputError("Conversation content cannot be an empty list.")
+
+        # Validate conversation format
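+        # e.g. a valid conversation: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]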
+        is_conversation = (
+            isinstance(content, list) and
+            all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in content)
+        )
+        if isinstance(content, list) and not is_conversation:
+            raise InvalidInputError(
+                "Invalid content structure for conversation. "
+                "Please provide a list of dicts with role and content fields."
+            )
+
+        # Prepare messages
+        if is_conversation:
+            messages = content
+        else:
+            messages = [{"role": "user", "content": content}]

-            # logger.info(f"Evaluating model-specific metric {metric.name}.")
-            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
-
-            # vLLM applies model's chat template automatically
-            llm_response: str = await self._call_model(messages, sampling_params, return_choices=False)
-
-            # Use metric's parser
-            return metric.parser_func(llm_response)
+        logger.info(
+            f"We assume you're using {metric.model_pattern} type model. "
+            f"If not, please do not use this metric and use a normal metric instead."
+        )

-        # Handle normal metrics
-        # Handle metric parameter
-        metric_template_vars = {}
+        # Get model response and parse
+        llm_response = await self._call_model(messages, sampling_params, return_choices=False)
+        return metric.parser_func(llm_response)
+
+    def _prepare_evaluation_params(
+        self,
+        metric: Optional[Metric],
+        criteria: Optional[str],
+        rubric: Union[str, Dict[Union[int, float], str], None],
+        scale: Optional[Tuple[int, int]],
+        examples: Optional[List[Dict[str, Any]]],
+        system_prompt: Optional[str],
+        template_engine: Union[str, TemplateEngine]
+    ) -> Dict[str, Any]:
+        """Prepare evaluation parameters, merging metric defaults with user overrides."""
+        params = {
+            "criteria": criteria,
+            "rubric": rubric,
+            "scale": scale,
+            "examples": examples,
+            "system_prompt": system_prompt,
+            "template_engine": template_engine,
+            "metric_template_vars": {}
+        }

         if metric:
             # Use metric defaults but allow overrides
-            criteria = criteria or metric.criteria
-            rubric = rubric or metric.rubric
-            scale = scale or metric.scale
-            examples = examples or metric.examples
-            system_prompt = system_prompt or metric.system_prompt
-            metric_template_vars = metric.template_vars
+            params["criteria"] = criteria or metric.criteria
+            params["rubric"] = rubric or metric.rubric
+            params["scale"] = scale or metric.scale
+            params["examples"] = examples or metric.examples
+            params["system_prompt"] = system_prompt or metric.system_prompt
+            params["metric_template_vars"] = metric.template_vars
             if metric.template_engine:
-                template_engine = metric.template_engine
+                params["template_engine"] = metric.template_engine

-        # Validate inputs
-        if not criteria:
+        # Validate required parameters
+        if not params["criteria"]:
             raise InvalidInputError("Either 'criteria' or 'metric' must be provided")

+        return params
+
+    def _process_templates(
+        self,
+        params: Dict[str, Any],
+        template_vars: Optional[Dict[str, Any]],
+        input_text: Optional[str],
+        context: Optional[str]
+    ) -> Dict[str, Any]:
+        """Process all template variables and return processed parameters."""
         # Determine template engine
-        engine = TemplateEngine(template_engine)
+        engine = TemplateEngine(params["template_engine"])

         # Merge template variables (metric defaults + user provided)
-        all_template_vars = {**metric_template_vars, **(template_vars or {})}
-        # Add input to template variables if provided
-        if input:
-            all_template_vars["input"] = input
+        all_template_vars = {**params["metric_template_vars"], **(template_vars or {})}
+        if input_text:
+            all_template_vars["input"] = input_text

-        # Process templates
-        criteria = TemplateProcessor.apply_template(
-            criteria, all_template_vars, engine, strict=True
-        )
-        rubric = TemplateProcessor.apply_template(
-            rubric, all_template_vars, engine, strict=True
-        )
-        system_prompt = TemplateProcessor.apply_template(
-            system_prompt, all_template_vars, engine, strict=True
-        )
-        context = TemplateProcessor.apply_template(
+        # Process templates for all relevant fields
+        template_fields = ["criteria", "rubric", "system_prompt"]
+        processed = {}
+
+        for field in template_fields:
+            processed[field] = TemplateProcessor.apply_template(
+                params[field], all_template_vars, engine, strict=True
+            )
+
+        # Process additional fields
+        processed["context"] = TemplateProcessor.apply_template(
             context, all_template_vars, engine, strict=True
         )
-        input = TemplateProcessor.apply_template(
-            input, all_template_vars, engine, strict=True
+        processed["input"] = TemplateProcessor.apply_template(
+            input_text, all_template_vars, engine, strict=True
         )

+        # Copy other parameters
+        processed.update({
+            "scale": params["scale"],
+            "examples": params["examples"],
+            "template_vars": all_template_vars,
+            "template_engine": engine
+        })
+
+        return processed
+
+    async def _execute_evaluation(
+        self,
+        content: Union[str, Dict[str, str], List[Dict[str, str]]],
+        params: Dict[str, Any],
+        sampling_params: Optional[Dict[str, Any]],
+        **kwargs
+    ) -> EvaluationResult:
+        """Execute the evaluation with processed parameters."""
         # Build messages
         messages = PromptBuilder.build_messages(
             content=content,
-            input=input,
-            criteria=criteria,
-            rubric=rubric,
-            scale=scale,
-            examples=examples,
-            system_prompt=system_prompt,
-            context=context,
+            input=params["input"],
+            criteria=params["criteria"],
+            rubric=params["rubric"],
+            scale=params["scale"],
+            examples=params["examples"],
+            system_prompt=params["system_prompt"],
+            context=params["context"],
             **kwargs
         )

-        # Get LLM response. We don't need choices for now.
-        llm_response: str = await self._call_model(messages, sampling_params, return_choices=False)
-
+        # Get LLM response
+        llm_response = await self._call_model(messages, sampling_params, return_choices=False)
+
         # Parse response
         result = self._parse_response(llm_response)

         # Add template info to metadata if used
-        if all_template_vars:
-            result.metadata["template_vars"] = all_template_vars
-            result.metadata["template_engine"] = engine.value
+        if params["template_vars"]:
+            result.metadata["template_vars"] = params["template_vars"]
+            result.metadata["template_engine"] = params["template_engine"].value

         return result
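A minimal usage sketch of the refactored `evaluate()` entry point. It assumes a `Judge`-style class exposing the method above and a registered metric named `"helpfulness"`; the class name, constructor argument, and metric name are illustrative assumptions, not part of this diff.

```python
import asyncio

# Hypothetical judge instance; only evaluate()'s signature comes from the diff above.
judge = Judge(model="example/judge-model")

async def main():
    # Ad-hoc evaluation: inline criteria, no registered metric. Internally this flows
    # through _prepare_evaluation_params() -> _process_templates() -> _execute_evaluation().
    result = await judge.evaluate(
        content="The capital of France is Paris.",
        criteria="Factual accuracy of the response",
        scale=(1, 5),
    )
    print(result.metadata)

    # Metric-based evaluation: the string is resolved via _resolve_metric(), and the
    # metric's criteria/rubric/scale defaults are merged with any user overrides.
    result = await judge.evaluate(
        content=[
            {"role": "user", "content": "How do I sort a list in Python?"},
            {"role": "assistant", "content": "Use sorted(my_list) or my_list.sort()."},
        ],
        metric="helpfulness",
        template_vars={"audience": "beginners"},
    )

asyncio.run(main())
```

Whichever path a call takes, it returns an `EvaluationResult`, so callers never need to know which private helper handled the request.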