@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,16 +11,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "dict_keys(['helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])"
+       "dict_keys(['llama_guard_3_safety', 'helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -63,7 +63,7 @@
        " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -74,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -87,7 +87,7 @@
        " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -101,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -114,7 +114,7 @@
        " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -132,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -146,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -159,20 +159,20 @@
        " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "res = await judge.evaluate(response=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
+    "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
     " metric=professional_tone_metric)\n",
     "res.model_dump()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -185,17 +185,80 @@
        " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "res = await judge.evaluate(response=\"Holy shit, this is a great!\",\n",
+    "res = await judge.evaluate(content=\"Holy shit, this is a great!\",\n",
     " metric=professional_tone_metric)\n",
     "res.model_dump()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'decision': True,\n",
+       " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n",
+       " 'score': None,\n",
+       " 'metadata': {'model': 'qwen2',\n",
+       " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n",
+       " 'template_vars': {'input': 'What is the capital of France?'},\n",
+       " 'template_engine': 'format'}}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "res = await judge.evaluate(\n",
+    " input=\"What is the capital of France?\",\n",
+    " content=\"Paris is the capital of France\",\n",
+    " criteria=\"accuracy and completeness\"\n",
+    ")\n",
+    "\n",
+    "res.model_dump()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'decision': True,\n",
+       " 'reasoning': 'The response correctly identifies Paris as the capital of France, which is accurate and complete.',\n",
+       " 'score': 10.0,\n",
+       " 'metadata': {'model': 'qwen2',\n",
+       " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, which is accurate and complete.\",\\n \"score\": 10\\n}',\n",
+       " 'template_vars': {'input': 'What is the capital of France?'},\n",
+       " 'template_engine': 'format'}}"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Or using the convenience method\n",
+    "res = await judge.qa_evaluate(\n",
+    " question=\"What is the capital of France?\",\n",
+    " answer=\"Paris is the capital of France\"\n",
+    ")\n",
+    "res.model_dump()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -216,13 +279,13 @@
      "data": {
       "text/plain": [
        "{'status': 'healthy',\n",
-       " 'version': '0.1.0',\n",
+       " 'version': '0.1.3',\n",
        " 'model': 'qwen2',\n",
        " 'base_url': 'http://localhost:8080',\n",
-       " 'uptime_seconds': 62.64390587806702,\n",
-       " 'total_evaluations': 1,\n",
+       " 'uptime_seconds': 12.22716999053955,\n",
+       " 'total_evaluations': 0,\n",
        " 'active_connections': 0,\n",
-       " 'metrics_available': 24}"
+       " 'metrics_available': 25}"
       ]
      },
      "execution_count": 2,
@@ -236,7 +299,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -249,14 +312,14 @@
        " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks technical detail and does not provide a substantive explanation of why Python is great.\",\\n \"score\": null\\n}'}}"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "result = await client.evaluate(\n",
-    " response=\"Python is great!\",\n",
+    " content=\"Python is great!\",\n",
     " criteria=\"technical accuracy\"\n",
     ")\n",
     "result.model_dump() "
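
Taken together, the change set does three things: the `response=` keyword on the evaluation calls is renamed to `content=`, a `llama_guard_3_safety` metric is registered (raising `metrics_available` from 24 to 25), and two new cells demonstrate criteria-based evaluation and the `qa_evaluate` convenience method. For quick reference, here are the updated call patterns collected in one place — a minimal sketch using only the calls exercised in the notebook, and assuming the `judge` and `client` objects plus `professional_tone_metric` come from the setup cells elided from this diff (top-level `await` works in a notebook cell):

```python
# Keyword rename: the text under evaluation is now passed as `content=`
# (previously `response=`).
res = await judge.evaluate(
    content="I want to bump the version to 1.0.1, is it a good idea?",
    metric=professional_tone_metric,  # metric defined in an elided setup cell
)

# Ad-hoc criteria, with the original question supplied as `input=` for
# context (new cell in this diff).
res = await judge.evaluate(
    input="What is the capital of France?",
    content="Paris is the capital of France",
    criteria="accuracy and completeness",
)

# Convenience wrapper for question answering (new cell in this diff).
res = await judge.qa_evaluate(
    question="What is the capital of France?",
    answer="Paris is the capital of France",
)

# The HTTP client path gets the same rename.
result = await client.evaluate(
    content="Python is great!",
    criteria="technical accuracy",
)

res.model_dump()  # decision / reasoning / score plus model metadata, as in the outputs above
```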