Commit 7fdac33

Authored and committed by Dan McPherson
Add unit tests for check_data
Signed-off-by: Dan McPherson <[email protected]>
1 parent: a16fe23 · commit: 7fdac33

File tree: 1 file changed (+190, −0)


tests/test_mt_bench_common.py

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0

# First Party
from instructlab.eval.mt_bench_common import Judge, check_data

# Two fake questions: one from a general category ("writing") and one from a
# reference-based category ("reasoning").
CHECK_DATA_EXAMPLE_QUESTIONS = [
    {
        "question_id": 81,
        "category": "writing",
        "turns": [
            "Fake question",
            "Fake question",
        ],
    },
    {
        "question_id": 101,
        "category": "reasoning",
        "turns": [
            "Fake question",
            "Fake question",
        ],
    },
]

# Answers from the model under test, keyed by model name, then question_id.
CHECK_DATA_EXAMPLE_MODEL_ANSWERS = {
    "granite-7b-lab": {
        81: {
            "question_id": 81,
            "answer_id": "c4j9vPyHM8w3JHPGohrJQG",
            "model_id": "granite-7b-lab",
            "choices": [
                {
                    "index": 0,
                    "turns": [
                        "Fake answer",
                        "Fake answer",
                    ],
                }
            ],
            "tstamp": 1730816201.883507,
        },
        101: {
            "question_id": 101,
            "answer_id": "kaQw7Fj2SDeE2VfvU25FJ4",
            "model_id": "granite-7b-lab",
            "choices": [
                {
                    "index": 0,
                    "turns": [
                        "Fake answer",
                        "Fake answer",
                    ],
                }
            ],
            "tstamp": 1730816166.3719094,
        },
    }
}

# Reference answers produced by the judge model, keyed by judge model name,
# then question_id.
CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS = {
    "merlinite-7b-lab": {
        101: {
            "question_id": 101,
            "answer_id": "TFomieEmmAgdeCkvmuvwbc",
            "model_id": "gpt-4",
            "choices": [
                {
                    "index": 0,
                    "turns": [
                        "Fake answer",
                        "Fake answer",
                    ],
                }
            ],
            "tstamp": 1686286924.844282,
        },
        102: {
            "question_id": 102,
            "answer_id": "hLH8WozvaB88bb5vV224H4",
            "model_id": "gpt-4",
            "choices": [
                {
                    "index": 0,
                    "turns": [
                        "Fake answer",
                        "Fake answer",
                    ],
                }
            ],
            "tstamp": 1686286937.7164738,
        },
    }
}

CHECK_DATA_EXAMPLE_MODELS = ["granite-7b-lab"]

# The four standard MT-Bench judge configurations: general and math prompts,
# each in single-turn and multi-turn variants.
CHECK_DATA_EXAMPLE_JUDGES = {
    "default": Judge(
        model_name="merlinite-7b-lab",
        prompt_template={
            "name": "single-v1",
            "type": "single",
            "system_prompt": "Fake prompt",
            "prompt_template": "Fake prompt",
            "description": "Prompt for general questions",
            "category": "general",
            "output_format": "[[rating]]",
        },
        ref_based=False,
        multi_turn=False,
    ),
    "math": Judge(
        model_name="merlinite-7b-lab",
        prompt_template={
            "name": "single-math-v1",
            "type": "single",
            "system_prompt": "Fake prompt",
            "prompt_template": "Fake prompt",
            "description": "Prompt for general questions",
            "category": "math",
            "output_format": "[[rating]]",
        },
        ref_based=True,
        multi_turn=False,
    ),
    "default-mt": Judge(
        model_name="merlinite-7b-lab",
        prompt_template={
            "name": "single-v1-multi-turn",
            "type": "single",
            "system_prompt": "Fake prompt",
            "prompt_template": "Fake prompt",
            "description": "Prompt for general questions",
            "category": "general",
            "output_format": "[[rating]]",
        },
        ref_based=False,
        multi_turn=True,
    ),
    "math-mt": Judge(
        model_name="merlinite-7b-lab",
        prompt_template={
            "name": "single-math-v1-multi-turn",
            "type": "single",
            "system_prompt": "Fake prompt",
            "prompt_template": "Fake prompt",
            "description": "Prompt for general questions",
            "category": "math",
            "output_format": "[[rating]]",
        },
        ref_based=True,
        multi_turn=True,
    ),
}


def test_check_data():
    # Complete, consistent data should pass without raising.
    check_data(
        CHECK_DATA_EXAMPLE_QUESTIONS,
        CHECK_DATA_EXAMPLE_MODEL_ANSWERS,
        CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS,
        CHECK_DATA_EXAMPLE_MODELS,
        CHECK_DATA_EXAMPLE_JUDGES,
    )

    # An empty answer set for the model under test should be rejected.
    try:
        check_data(
            CHECK_DATA_EXAMPLE_QUESTIONS,
            {"granite-7b-lab": {}},
            CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS,
            CHECK_DATA_EXAMPLE_MODELS,
            CHECK_DATA_EXAMPLE_JUDGES,
        )
    except Exception as e:
        assert "Missing model granite-7b-lab's answer to Question" in str(e)
    else:
        assert False, "Didn't fail with missing model answer"

    # An empty reference-answer set for the judge model should be rejected.
    try:
        check_data(
            CHECK_DATA_EXAMPLE_QUESTIONS,
            CHECK_DATA_EXAMPLE_MODEL_ANSWERS,
            {"merlinite-7b-lab": {}},
            CHECK_DATA_EXAMPLE_MODELS,
            CHECK_DATA_EXAMPLE_JUDGES,
        )
    except Exception as e:
        assert "Missing reference answer to Question" in str(e)
    else:
        assert False, "Didn't fail with missing reference answer"
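
For context, here is a minimal sketch of the contract these tests pin down, inferred solely from the fixtures and the asserted error substrings above. The helper name check_data_sketch, the NEED_REF_CATS list (borrowed from upstream MT-Bench), and the exact message wording are illustrative assumptions; the authoritative implementation is instructlab.eval.mt_bench_common.check_data.

# Hypothetical sketch for illustration only -- not the shipped implementation.
NEED_REF_CATS = ["math", "reasoning", "coding"]  # assumed; mirrors upstream MT-Bench

def check_data_sketch(questions, model_answers, ref_answers, models, judges):
    for question in questions:
        qid = question["question_id"]
        # Every model under evaluation must have answered every question.
        for model in models:
            assert qid in model_answers.get(model, {}), (
                f"Missing model {model}'s answer to Question {qid}"
            )
        # Reference-based categories additionally require an answer from
        # each reference-based judge's model.
        if question["category"] in NEED_REF_CATS:
            for judge in judges.values():
                if judge.ref_based:
                    assert qid in ref_answers.get(judge.model_name, {}), (
                        f"Missing reference answer to Question {qid}"
                    )

Run against the fixtures above, this sketch accepts the complete data and fails with the same substrings the tests assert when either answer set is emptied.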
