
Commit b92ae0a

add 'input' field for qa related evaluations
1 parent bbf3d52 commit b92ae0a

File tree

10 files changed: +523, -116 lines changed
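
For quick reference, the calls added by this commit (taken from the new cells in examples/basic_test.ipynb shown below) look roughly like the following sketch. The judge object is constructed in earlier notebook cells that are not part of this diff, so it is assumed here; the snippet is meant to run inside a notebook cell, where top-level await is valid.

# Sketch of the new QA-style evaluation introduced by this commit.
# Assumes `judge` was created in an earlier notebook cell (not shown in this diff).

# Evaluate an answer in the context of the question it responds to,
# using the new 'input' field:
res = await judge.evaluate(
    input="What is the capital of France?",
    content="Paris is the capital of France",
    criteria="accuracy and completeness"
)
res.model_dump()

# Or the question/answer convenience method:
res = await judge.qa_evaluate(
    question="What is the capital of France?",
    answer="Paris is the capital of France"
)
res.model_dump()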

examples/basic_test.ipynb

Lines changed: 89 additions & 26 deletions
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -11,16 +11,16 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
-"dict_keys(['helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])"
+"dict_keys(['llama_guard_3_safety', 'helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])"
 ]
 },
-"execution_count": 2,
+"execution_count": 6,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -31,7 +31,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -40,7 +40,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -50,7 +50,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 9,
 "metadata": {},
 "outputs": [
 {
@@ -63,7 +63,7 @@
 " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}"
 ]
 },
-"execution_count": 5,
+"execution_count": 9,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -74,7 +74,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 10,
 "metadata": {},
 "outputs": [
 {
@@ -87,7 +87,7 @@
 " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}"
 ]
 },
-"execution_count": 5,
+"execution_count": 10,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -101,7 +101,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 11,
 "metadata": {},
 "outputs": [
 {
@@ -114,7 +114,7 @@
 " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}"
 ]
 },
-"execution_count": 6,
+"execution_count": 11,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -132,7 +132,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -146,7 +146,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 13,
 "metadata": {},
 "outputs": [
 {
@@ -159,20 +159,20 @@
 " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}"
 ]
 },
-"execution_count": 9,
+"execution_count": 13,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
-"res = await judge.evaluate(response=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
+"res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
 " metric=professional_tone_metric)\n",
 "res.model_dump()"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 14,
 "metadata": {},
 "outputs": [
 {
@@ -185,17 +185,80 @@
 " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}"
 ]
 },
-"execution_count": 10,
+"execution_count": 14,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
-"res = await judge.evaluate(response=\"Holy shit, this is a great!\",\n",
+"res = await judge.evaluate(content=\"Holy shit, this is a great!\",\n",
 " metric=professional_tone_metric)\n",
 "res.model_dump()"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 15,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'decision': True,\n",
+" 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n",
+" 'score': None,\n",
+" 'metadata': {'model': 'qwen2',\n",
+" 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n",
+" 'template_vars': {'input': 'What is the capital of France?'},\n",
+" 'template_engine': 'format'}}"
+]
+},
+"execution_count": 15,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"res = await judge.evaluate(\n",
+" input=\"What is the capital of France?\",\n",
+" content=\"Paris is the capital of France\",\n",
+" criteria=\"accuracy and completeness\"\n",
+")\n",
+"\n",
+"res.model_dump()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 16,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{'decision': True,\n",
+" 'reasoning': 'The response correctly identifies Paris as the capital of France, which is accurate and complete.',\n",
+" 'score': 10.0,\n",
+" 'metadata': {'model': 'qwen2',\n",
+" 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, which is accurate and complete.\",\\n \"score\": 10\\n}',\n",
+" 'template_vars': {'input': 'What is the capital of France?'},\n",
+" 'template_engine': 'format'}}"
+]
+},
+"execution_count": 16,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# Or using the convenience method\n",
+"res = await judge.qa_evaluate(\n",
+" question=\"What is the capital of France?\",\n",
+" answer=\"Paris is the capital of France\"\n",
+")\n",
+"res.model_dump()"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 1,
@@ -216,13 +279,13 @@
 "data": {
 "text/plain": [
 "{'status': 'healthy',\n",
-" 'version': '0.1.0',\n",
+" 'version': '0.1.3',\n",
 " 'model': 'qwen2',\n",
 " 'base_url': 'http://localhost:8080',\n",
-" 'uptime_seconds': 62.64390587806702,\n",
-" 'total_evaluations': 1,\n",
+" 'uptime_seconds': 12.22716999053955,\n",
+" 'total_evaluations': 0,\n",
 " 'active_connections': 0,\n",
-" 'metrics_available': 24}"
+" 'metrics_available': 25}"
 ]
 },
 "execution_count": 2,
@@ -236,7 +299,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 4,
 "metadata": {},
 "outputs": [
 {
@@ -249,14 +312,14 @@
 " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks technical detail and does not provide a substantive explanation of why Python is great.\",\\n \"score\": null\\n}'}}"
 ]
 },
-"execution_count": 3,
+"execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "result = await client.evaluate(\n",
-" response=\"Python is great!\",\n",
+" content=\"Python is great!\",\n",
 " criteria=\"technical accuracy\"\n",
 ")\n",
 "result.model_dump() "
