
Commit d60092e

added support for MetricResult
1 parent 9c77367 commit d60092e

File tree

10 files changed: +739, -38 lines

nbs/metric/base.ipynb

Lines changed: 12 additions & 13 deletions
@@ -24,16 +24,7 @@
 "execution_count": null,
 "id": "e8ccff58",
 "metadata": {},
-"outputs": [
- {
-  "name": "stderr",
-  "output_type": "stream",
-  "text": [
-   "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-   " from .autonotebook import tqdm as notebook_tqdm\n"
-  ]
- }
-],
+"outputs": [],
 "source": [
 "#| export\n",
 "\n",
@@ -53,8 +44,16 @@
 "from ragas_annotator.project.core import Project\n",
 "from ragas_annotator.model.notion_model import NotionModel\n",
 "from ragas_annotator.prompt.dynamic_few_shot import DynamicFewShotPrompt\n",
-"\n",
-"\n",
+"\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"#| export\n",
 "@dataclass\n",
 "class Metric(ABC):\n",
 " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n",
@@ -174,7 +173,7 @@
 {
 "data": {
 "text/plain": [
-"100"
+"1"
 ]
 },
 "execution_count": null,

nbs/metric/discrete.ipynb

Lines changed: 7 additions & 13 deletions
@@ -90,17 +90,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"med\n",
-"The given input \"this is my response\" is too vague to provide a comprehensive evaluation.\n",
-"\n",
-"Positives:\n",
-"1. Clear Statement: It's a straightforward indication that a response has been provided.\n",
-"\n",
-"Negatives:\n",
-"1. Lack of Context: Without context or additional information, it's impossible to assess the relevance or accuracy of the response.\n",
-"2. No Specificity: The response doesn't convey any specific information or insight related to a topic or question.\n",
-"\n",
-"If this response was intended to be part of a conversation or instruction, more detail would be required to make it highly effective. At present, it serves as a neutral statement without actionable or informative content.\n"
+"low\n",
+"The response is incomplete and lacks any specific information. It cannot be evaluated for helpfulness without further context or content.\n"
 ]
 }
 ],
@@ -152,9 +143,12 @@
 "#| eval: false\n",
 "from ragas_annotator.metric.result import MetricResult\n",
 "\n",
-"@discrete_metric(llm=llm,\n",
+"@discrete_metric(\n",
+" llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
-" name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
+" name='new_metric',\n",
+" values=[\"low\",\"med\",\"high\"]\n",
+")\n",
 "def my_metric(llm,prompt,**kwargs):\n",
 "\n",
 " class response_model(BaseModel):\n",

nbs/metric/result.ipynb

Lines changed: 112 additions & 6 deletions
@@ -22,18 +22,25 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "0f1c801a-6568-4ba4-8bbe-30bf154174fe",
+"id": "dcc3080c",
 "metadata": {},
 "outputs": [],
 "source": [
 "#| export\n",
 "\n",
 "import typing as t\n",
 "\n",
-"\n",
-"\n",
-"\n",
-"\n",
+"from fastcore.utils import patch"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "0f1c801a-6568-4ba4-8bbe-30bf154174fe",
+"metadata": {},
+"outputs": [],
+"source": [
+"#| export\n",
 "class MetricResult:\n",
 " \"\"\"Class to hold the result of a metric evaluation.\n",
 " \n",
@@ -248,10 +255,109 @@
 "print(list_result[1:]) # 2\n"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "06ce7a1d",
+"metadata": {},
+"source": [
+"now lets make it `Pydantic` compatible also"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "5d8fb818",
+"metadata": {},
+"outputs": [],
+"source": [
+"#| export\n",
+"from pydantic_core import core_schema\n",
+"from pydantic import GetCoreSchemaHandler, ValidationInfo"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "f4c288c0",
+"metadata": {},
+"outputs": [],
+"source": [
+"#| export\n",
+"\n",
+"@patch(cls_method=True)\n",
+"def validate(cls: MetricResult, value: t.Any, info: ValidationInfo):\n",
+" \"\"\"Provide compatibility with older Pydantic versions.\"\"\"\n",
+" if isinstance(value, MetricResult):\n",
+" return value\n",
+" return MetricResult(result=value)\n",
+"\n",
+"# Add Pydantic compatibility methods\n",
+"@patch(cls_method=True)\n",
+"def __get_pydantic_core_schema__(\n",
+" cls: MetricResult, \n",
+" _source_type: t.Any, \n",
+" _handler: GetCoreSchemaHandler\n",
+") -> core_schema.CoreSchema:\n",
+" \"\"\"Generate a Pydantic core schema for MetricResult.\"\"\"\n",
+" return core_schema.with_info_plain_validator_function(cls.validate)\n",
+"\n",
+"\n",
+"@patch\n",
+"def model_dump(self: MetricResult):\n",
+" \"\"\"Support Pydantic's model_dump method.\"\"\"\n",
+" return self.to_dict()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "f49739a6",
+"metadata": {},
+"outputs": [],
+"source": [
+"from pydantic import BaseModel\n",
+"\n",
+"class TestModel(BaseModel):\n",
+" response: str\n",
+" grade: MetricResult\n",
+" faithfulness: MetricResult\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "6ac6b955",
+"metadata": {},
+"outputs": [],
+"source": [
+"m = TestModel(response=\"test\", grade=MetricResult(result=1, reason=\"test\"), faithfulness=MetricResult(result=1, reason=\"test\"))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "4ffe750f",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"'test'"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"m.grade.reason"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "a984dde9",
+"id": "9d32b10f",
 "metadata": {},
 "outputs": [],
 "source": []
nbs/project/experiments.ipynb

Lines changed: 68 additions & 0 deletions
@@ -363,6 +363,74 @@
 " return decorator"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# create experimental test dataset\n",
+"test_dataset = p.create_dataset(name=\"test dataset for experiment\", model=TestModel)\n",
+"test_dataset.append(TestModel(name=\"test item 1\", description=\"test item 1 description\", price=100))\n",
+"test_dataset.append(TestModel(name=\"test item 2\", description=\"test item 2 description\", price=200))\n",
+"test_dataset.append(TestModel(name=\"test item 3\", description=\"test item 3 description\", price=300))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# create experiment model\n",
+"class TextExperimentModel(TestModel):\n",
+" response: str\n",
+" is_correct: t.Literal[\"yes\", \"no\"]\n",
+"\n",
+"# create a test experiment function\n",
+"@p.experiment(TextExperimentModel)\n",
+"async def test_experiment(item: TestModel):\n",
+" print(item)\n",
+" return TextExperimentModel(**item.model_dump(), response=\"test response\", is_correct=\"yes\")\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"100%|██████████| 3/3 [00:00<00:00, 7752.87it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"name='test item 2' description='test item 2 description' price=200.0\n",
+"name='test item 1' description='test item 1 description' price=100.0\n",
+"name='test item 3' description='test item 3 description' price=300.0\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"Experiment(name=keen_backus, model=TextExperimentModel)"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# run the experiment\n",
+"await test_experiment.run_async(test_dataset)"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,