@@ -107,113 +107,191 @@ async def evaluate(
            MetricNotFoundError: If metric name not found
            ParseError: If unable to parse model response
        """
+        # Resolve metric if string
+        resolved_metric = self._resolve_metric(metric)
+
+        # Handle model-specific metrics early
+        if isinstance(resolved_metric, ModelSpecificMetric):
+            return await self._evaluate_model_specific_metric(
+                resolved_metric, content, sampling_params
+            )
+
+        # Process normal evaluation
+        evaluation_params = self._prepare_evaluation_params(
+            resolved_metric, criteria, rubric, scale, examples,
+            system_prompt, template_engine
+        )
+
+        # Process templates
+        processed_params = self._process_templates(
+            evaluation_params, template_vars, input, context
+        )
+
+        # Build and execute evaluation
+        return await self._execute_evaluation(
+            content, processed_params, sampling_params, **kwargs
+        )
+
+    def _resolve_metric(self, metric: Union[Metric, str, None]) -> Optional[Metric]:
+        """Resolve metric string to Metric object."""
        if metric and isinstance(metric, str):
-            metric: Metric = self.get_metric(metric)
-
-        # Handle model-specific metrics
-        if isinstance(metric, ModelSpecificMetric):
-            if isinstance(content, dict):
-                raise InvalidInputError("Model-specific metrics only support string and list of dicts as content for now")
-
-            if isinstance(content, list) and len(content) == 0:
-                raise InvalidInputError("Conversation content cannot be an empty list.")
-
-            is_conversation = (
-                isinstance(content, list) and
-                all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in content)
+            return self.get_metric(metric)
+        return metric
+
+    async def _evaluate_model_specific_metric(
+        self,
+        metric: ModelSpecificMetric,
+        content: Union[str, List[Dict[str, str]]],
+        sampling_params: Optional[Dict[str, Any]]
+    ) -> EvaluationResult:
+        """Handle evaluation for model-specific metrics."""
+        # Validate content for model-specific metrics
+        if isinstance(content, dict):
+            raise InvalidInputError(
+                "Model-specific metrics only support string and list of dicts as content for now"
            )
-            if isinstance(content, list) and not is_conversation:
-                raise InvalidInputError("Invalid content structure for conversation. Please provide a list of dicts with role and content fields.")
-
-
-            # Skip ALL our formatting
-            if is_conversation:
-                messages = content
-            else:
-                messages = [{"role": "user", "content": content}]
-
-            # logger.info(f"Evaluating model-specific metric {metric.name}.")
-            logger.info(f"We assume you're using {metric.model_pattern} type model. If not, please do not use this metric and use a normal metric instead.")
+
+        if isinstance(content, list) and len(content) == 0:
+            raise InvalidInputError("Conversation content cannot be an empty list.")
+
+        # Validate conversation format
+        is_conversation = (
+            isinstance(content, list) and
+            all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in content)
+        )
+        if isinstance(content, list) and not is_conversation:
+            raise InvalidInputError(
+                "Invalid content structure for conversation. "
+                "Please provide a list of dicts with role and content fields."
+            )
+
+        # Prepare messages
+        if is_conversation:
+            messages = content
+        else:
+            messages = [{"role": "user", "content": content}]

-            if metric.sampling_params:
-                if sampling_params is None:
-                    sampling_params = {}
-                sampling_params.update(metric.sampling_params)
-
-            # vLLM applies model's chat template automatically
-            llm_response = await self._call_model(messages, sampling_params, return_choices=metric.return_choices)
-
-            # Use metric's parser
-            return metric.parser_func(llm_response)
+        logger.info(
+            f"We assume you're using {metric.model_pattern} type model. "
+            f"If not, please do not use this metric and use a normal metric instead."
+        )

-        # Handle normal metrics
-        # Handle metric parameter
-        metric_template_vars = {}
+        # Get model response and parse
+        llm_response = await self._call_model(messages, sampling_params, return_choices=False)
+        return metric.parser_func(llm_response)
+
+    def _prepare_evaluation_params(
+        self,
+        metric: Optional[Metric],
+        criteria: Optional[str],
+        rubric: Union[str, Dict[Union[int, float], str], None],
+        scale: Optional[Tuple[int, int]],
+        examples: Optional[List[Dict[str, Any]]],
+        system_prompt: Optional[str],
+        template_engine: Union[str, TemplateEngine]
+    ) -> Dict[str, Any]:
+        """Prepare evaluation parameters, merging metric defaults with user overrides."""
+        params = {
+            "criteria": criteria,
+            "rubric": rubric,
+            "scale": scale,
+            "examples": examples,
+            "system_prompt": system_prompt,
+            "template_engine": template_engine,
+            "metric_template_vars": {}
+        }

        if metric:
            # Use metric defaults but allow overrides
-            criteria = criteria or metric.criteria
-            rubric = rubric or metric.rubric
-            scale = scale or metric.scale
-            examples = examples or metric.examples
-            system_prompt = system_prompt or metric.system_prompt
-            metric_template_vars = metric.template_vars
+            params["criteria"] = criteria or metric.criteria
+            params["rubric"] = rubric or metric.rubric
+            params["scale"] = scale or metric.scale
+            params["examples"] = examples or metric.examples
+            params["system_prompt"] = system_prompt or metric.system_prompt
+            params["metric_template_vars"] = metric.template_vars
            if metric.template_engine:
-                template_engine = metric.template_engine
+                params["template_engine"] = metric.template_engine

-        # Validate inputs
-        if not criteria:
+        # Validate required parameters
+        if not params["criteria"]:
            raise InvalidInputError("Either 'criteria' or 'metric' must be provided")

+        return params
+
+    def _process_templates(
+        self,
+        params: Dict[str, Any],
+        template_vars: Optional[Dict[str, Any]],
+        input_text: Optional[str],
+        context: Optional[str]
+    ) -> Dict[str, Any]:
+        """Process all template variables and return processed parameters."""
        # Determine template engine
-        engine = TemplateEngine(template_engine)
+        engine = TemplateEngine(params["template_engine"])

        # Merge template variables (metric defaults + user provided)
-        all_template_vars = {**metric_template_vars, **(template_vars or {})}
-        # Add input to template variables if provided
-        if input:
-            all_template_vars["input"] = input
+        all_template_vars = {**params["metric_template_vars"], **(template_vars or {})}
+        if input_text:
+            all_template_vars["input"] = input_text

-        # Process templates
-        criteria = TemplateProcessor.apply_template(
-            criteria, all_template_vars, engine, strict=True
-        )
-        rubric = TemplateProcessor.apply_template(
-            rubric, all_template_vars, engine, strict=True
-        )
-        system_prompt = TemplateProcessor.apply_template(
-            system_prompt, all_template_vars, engine, strict=True
-        )
-        context = TemplateProcessor.apply_template(
+        # Process templates for all relevant fields
+        template_fields = ["criteria", "rubric", "system_prompt"]
+        processed = {}
+
+        for field in template_fields:
+            processed[field] = TemplateProcessor.apply_template(
+                params[field], all_template_vars, engine, strict=True
+            )
+
+        # Process additional fields
+        processed["context"] = TemplateProcessor.apply_template(
            context, all_template_vars, engine, strict=True
        )
-        input = TemplateProcessor.apply_template(
-            input, all_template_vars, engine, strict=True
+        processed["input"] = TemplateProcessor.apply_template(
+            input_text, all_template_vars, engine, strict=True
        )

+        # Copy other parameters
+        processed.update({
+            "scale": params["scale"],
+            "examples": params["examples"],
+            "template_vars": all_template_vars,
+            "template_engine": engine
+        })
+
+        return processed
+
+    async def _execute_evaluation(
+        self,
+        content: Union[str, Dict[str, str], List[Dict[str, str]]],
+        params: Dict[str, Any],
+        sampling_params: Optional[Dict[str, Any]],
+        **kwargs
+    ) -> EvaluationResult:
+        """Execute the evaluation with processed parameters."""
        # Build messages
        messages = PromptBuilder.build_messages(
            content=content,
-            input=input,
-            criteria=criteria,
-            rubric=rubric,
-            scale=scale,
-            examples=examples,
-            system_prompt=system_prompt,
-            context=context,
+            input=params["input"],
+            criteria=params["criteria"],
+            rubric=params["rubric"],
+            scale=params["scale"],
+            examples=params["examples"],
+            system_prompt=params["system_prompt"],
+            context=params["context"],
            **kwargs
        )

-        # Get LLM response. We don't need choices for now.
-        llm_response: str = await self._call_model(messages, sampling_params, return_choices=False)
-
+        # Get LLM response
+        llm_response = await self._call_model(messages, sampling_params, return_choices=False)
+
        # Parse response
        result = self._parse_response(llm_response)

        # Add template info to metadata if used
-        if all_template_vars:
-            result.metadata["template_vars"] = all_template_vars
-            result.metadata["template_engine"] = engine.value
+        if params["template_vars"]:
+            result.metadata["template_vars"] = params["template_vars"]
+            result.metadata["template_engine"] = params["template_engine"].value

        return result
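For orientation, a minimal usage sketch of the refactored flow from the caller's side; the `judge` object, its construction, and the `"helpfulness"` metric name are illustrative assumptions, not part of this diff:

```python
import asyncio

# Assumed: `judge` is an instance of the class defining `evaluate()` above;
# how it is constructed (model name, base URL, etc.) is not shown in this diff.

async def main():
    # Path 1: metric passed as a string. _resolve_metric() looks it up, and
    # _prepare_evaluation_params() merges its defaults (criteria, rubric, scale,
    # examples, system_prompt) with anything passed explicitly here.
    result = await judge.evaluate(
        content="The Earth orbits the Sun once every 365.25 days.",
        metric="helpfulness",  # assumed registered metric name
    )
    print(result.metadata)

    # Path 2: no metric. `criteria` is then required, and _process_templates()
    # substitutes template_vars into criteria/rubric/system_prompt/context/input
    # before _execute_evaluation() builds the prompt and calls the model.
    # (Placeholder syntax depends on the configured TemplateEngine.)
    result = await judge.evaluate(
        content="def add(a, b): return a + b",
        criteria="Rate the {language} snippet for correctness and style.",
        scale=(1, 5),
        template_vars={"language": "Python"},
    )
    print(result.metadata.get("template_vars"))

asyncio.run(main())
```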