
Commit a164263

make metrics work with new llm interface
1 parent 0eb5002 commit a164263

8 files changed, +172 -31 lines changed


nbs/metric/base.ipynb

Lines changed: 9 additions & 3 deletions
@@ -43,14 +43,14 @@
 "from pydantic import BaseModel\n",
 "import typing as t\n",
 "from ragas_annotator.metric import MetricResult\n",
-"from ragas_annotator.metric import LLM\n",
+"from ragas_annotator.llm import RagasLLM\n",
 "\n",
 "@dataclass\n",
 "class Metric(ABC):\n",
 "    \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n",
 "    name: str\n",
 "    prompt: str\n",
-"    llm: LLM\n",
+"    llm: RagasLLM\n",
 "    _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n",
 "        default_factory=dict, init=False, repr=False\n",
 "    )\n",
@@ -136,7 +136,13 @@
 ]
 }
 ],
-"metadata": {},
+"metadata": {
+"kernelspec": {
+"display_name": "python3",
+"language": "python",
+"name": "python3"
+}
+},
 "nbformat": 4,
 "nbformat_minor": 2
 }
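The only functional change in this notebook is the type of the `llm` field: `Metric.llm` is now annotated as `RagasLLM` from the new `ragas_annotator.llm` package. A minimal sketch of constructing such an instance, mirroring the example cells later in this commit; it assumes the `openai` package is installed and `OPENAI_API_KEY` is set in the environment.

from openai import OpenAI

from ragas_annotator.llm import RagasLLM, ragas_llm

# The new LLM interface: a provider-agnostic wrapper built by the ragas_llm factory.
# This is what Metric.llm now expects instead of the old metric-local LLM class.
llm: RagasLLM = ragas_llm(provider="openai", model="gpt-4o", client=OpenAI())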

nbs/metric/decorator.ipynb

Lines changed: 31 additions & 3 deletions
@@ -30,6 +30,7 @@
 "import asyncio\n",
 "from dataclasses import dataclass\n",
 "from ragas_annotator.metric import MetricResult\n",
+"from ragas_annotator.llm import RagasLLM\n",
 "\n",
 "\n",
 "\n",
@@ -44,7 +45,7 @@
 " Returns:\n",
 " A decorator factory function for the specified metric type\n",
 " \"\"\"\n",
-" def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n",
+" def decorator_factory(llm:RagasLLM, prompt, name: t.Optional[str] = None, **metric_params):\n",
 " \"\"\"\n",
 " Creates a decorator that wraps a function into a metric instance.\n",
 " \n",
@@ -149,6 +150,29 @@
 "### Example usage\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -168,12 +192,16 @@
 "\n",
 "\n",
 "from ragas_annotator.metric import DiscreteMetric\n",
-"from ragas_annotator.metric.llm import LLM\n",
 "from pydantic import BaseModel\n",
 "\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "discrete_metric = create_metric_decorator(DiscreteMetric)\n",
 "\n",
-"@discrete_metric(llm=LLM(),\n",
+"@discrete_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/discrete.ipynb

Lines changed: 42 additions & 6 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "import typing as t\n",
@@ -81,6 +90,29 @@
 "## Example usage"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -91,18 +123,22 @@
 "output_type": "stream",
 "text": [
 "low\n",
-"No context or content was provided for evaluation.\n"
+"The response does not provide any specific information or context that can help evaluate its helpfulness.\n"
 ]
 }
 ],
 "source": [
 "\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "\n",
 "my_metric = DiscreteMetric(\n",
-" llm=LLM(),\n",
+" llm=llm,\n",
 " name='helpfulness',\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " values=[\"low\",\"med\",\"high\"],\n",
@@ -130,14 +166,14 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"low\n",
+"high\n",
 "reason\n"
 ]
 }
 ],
 "source": [
 "#| eval: false\n",
-"@discrete_metric(llm=LLM(),\n",
+"@discrete_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/numeric.ipynb

Lines changed: 43 additions & 6 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "\n",
@@ -81,7 +90,7 @@
 {
 "data": {
 "text/plain": [
-"'The response does not provide any context or information that can be evaluated as helpful.'"
+"True"
 ]
 },
 "execution_count": null,
@@ -90,13 +99,41 @@
 }
 ],
 "source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"\"The provided input lacks context or content to determine if it is helpful as it merely states 'this is my response' without any additional information.\""
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "\n",
 "my_metric = NumericMetric(\n",
 " name='helpfulness',\n",
-" llm=LLM(),\n",
+" llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " range=(0,10),\n",
 ")\n",
@@ -122,7 +159,7 @@
 {
 "data": {
 "text/plain": [
-"20"
+"10"
 ]
 },
 "execution_count": null,
@@ -134,7 +171,7 @@
 "\n",
 "#| eval: false\n",
 "\n",
-"@numeric_metric(llm=LLM(),\n",
+"@numeric_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',range=(0,10))\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/ranking.ipynb

Lines changed: 43 additions & 8 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "\n",
@@ -108,6 +117,29 @@
 "### Example usage"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -117,23 +149,26 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[0, 1, 2]\n",
+"[2, 1, 0]\n",
 "Ensemble ranking based on multiple evaluations.\n",
-"The ranking is based on the length and detail of the responses, with 'short answer.' being the least detailed (rank 0), 'a bit more detailed.' being moderate (rank 1), and 'the longest and most detailed answer.' being the most comprehensive (rank 2).\n",
-"The ranking is based on the length and detail of the responses. The shortest response is ranked the lowest (0), the moderately detailed response is ranked higher (1), and the longest and most detailed response is ranked the highest (2).\n",
-"Ranking is based on length and detail; the longest answer (2) is most detailed, followed by a bit more detailed (1), and the shortest answer (0) is the least detailed.\n"
+"The ranking is based on the length and detail of each response. 'the longest and most detailed answer.' is the most comprehensive, followed by 'a bit more detailed.', and 'short answer.' is the briefest.\n",
+"The ranking is based on the length and detail of each response. The response 'the longest and most detailed answer.' is ranked highest (2) because it is the most detailed, followed by 'a bit more detailed.' (1), and finally 'short answer.' (0) as it is the least detailed.\n",
+"The responses are ranked based on the level of detail and length. 'short answer.' is the least detailed, 'a bit more detailed.' provides more information, and 'the longest and most detailed answer.' offers the most comprehensive explanation.\n"
 ]
 }
 ],
 "source": [
 "\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
 "\n",
 "my_ranking_metric = RankingMetric(\n",
 " name='response_ranking',\n",
-" llm=LLM(), # Your language model instance\n",
+" llm=llm, # Your language model instance\n",
 " prompt=\"Rank the following responses:\\n{candidates}\",\n",
 " num_ranks=3,\n",
 ")\n",
@@ -175,7 +210,7 @@
 "\n",
 "\n",
 "@ranking_metric(\n",
-" llm=LLM(), # Your language model instance\n",
+" llm=llm, # Your language model instance\n",
 " prompt=\"Rank the following responses:\\n{candidates}\",\n",
 " name='new_ranking_metric',\n",
 " num_ranks=3\n",

ragas_annotator/metric/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -1,12 +1,10 @@
 from ragas_annotator.metric.result import MetricResult
-from ragas_annotator.metric.llm import LLM
 from ragas_annotator.metric.base import Metric
 from ragas_annotator.metric.discrete import DiscreteMetric
 from ragas_annotator.metric.numeric import NumericMetric
 from ragas_annotator.metric.ranking import RankingMetric
 
 __all__ = ['MetricResult',
-           'LLM',
            'Metric',
            'DiscreteMetric',
            'NumericMetric',
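With LLM dropped from the package's exports, the import surface for callers changes as sketched below, assuming ragas_annotator.llm exposes both RagasLLM and the ragas_llm factory, as the notebook imports in this commit indicate.

# Before this commit:
#   from ragas_annotator.metric import LLM
#   llm = LLM()

# After this commit:
from ragas_annotator.llm import RagasLLM, ragas_llm  # new home of the LLM interface
from ragas_annotator.metric import (                  # metric exports are unchanged
    DiscreteMetric,
    Metric,
    MetricResult,
    NumericMetric,
    RankingMetric,
)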

ragas_annotator/metric/base.py

Lines changed: 2 additions & 2 deletions
@@ -12,14 +12,14 @@
 from pydantic import BaseModel
 import typing as t
 from . import MetricResult
-from . import LLM
+from ..llm import RagasLLM
 
 @dataclass
 class Metric(ABC):
     """Base class for all metrics in the LLM evaluation library."""
     name: str
     prompt: str
-    llm: LLM
+    llm: RagasLLM
     _response_models: t.Dict[bool, t.Type[BaseModel]] = field(
         default_factory=dict, init=False, repr=False
     )
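One practical consequence of the type change: downstream code that wires an LLM into a metric should now annotate against RagasLLM instead of the removed LLM class. The helper below is hypothetical and purely illustrative of that annotation.

from openai import OpenAI

from ragas_annotator.llm import RagasLLM, ragas_llm
from ragas_annotator.metric import DiscreteMetric, Metric


def build_helpfulness_metric(llm: RagasLLM) -> Metric:
    # Hypothetical factory: any RagasLLM produced by ragas_llm(...) satisfies
    # the Metric.llm field introduced in the hunk above.
    return DiscreteMetric(
        llm=llm,
        name='helpfulness',
        prompt="Evaluate if given answer is helpful\n\n{response}",
        values=["low", "med", "high"],
    )


metric = build_helpfulness_metric(
    ragas_llm(provider="openai", model="gpt-4o", client=OpenAI())
)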
