You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
"""Create a mellea session with the specified backend and model."""
80
+
"""Create a mellea session with the specified backend and model."""
81
81
82
82
model_id=None
83
83
if model:
@@ -164,7 +164,15 @@ def run_evaluations(
164
164
output_format: str,
165
165
continue_on_error: bool,
166
166
):
167
-
"""Run all 'unit test' evaluations"""
167
+
"""Run all 'unit test' evaluations
168
+
169
+
Each test file should be a JSON file containing:
170
+
"id": an id that is unique to this test file
171
+
"source": the origin for the evaluation prompts, else "N/A"
172
+
"name": an instruction-following attribute that the user intends to evaluate through this test
173
+
"instructions": a set (in string form) of requirements which the generation should follow; the judge will evaluate if these are satisfied
174
+
"examples": a list of entries containing an input_id, an input(prompt), and a list of targets. Each input may have multiple (or no) targets; inputs and targets are in messages format.
175
+
"""
168
176
all_test_evals: List[TestBasedEval] = []
169
177
170
178
for test_file in test_files:
@@ -230,7 +238,7 @@ def execute_test_eval(
230
238
) -> TestEvalResult:
231
239
"""Execute a single test evaluation
232
240
For each input in the test, generate a response using generation_session
233
-
Then, after all inputs are processed, validate using judge_session
241
+
Then, after all inputs are processed, validate using judge_session.
The model output should adhere to the following guidelines:
17
+
{{ guidelines }}
18
+
19
+
**Scoring Criteria**
20
+
* Score 0: The model output violates any of the guidelines.
21
+
* Score 1: The model output is well aligned with the ground truth{% if target and target != "N/A" %} - if it exists{% endif %}, the input to the model, and adheres to all guidelines.
0 commit comments