Commit e16b6f5

feat: add simple scorer metric (#1291)
```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._simple_criteria import SimpleCriteriaScoreWithoutReference

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
)

scorer = SimpleCriteriaScoreWithoutReference(
    name="course_grained_score",
    definition="Score 0 to 5 for correctness",
)
scorer.llm = openai_model
await scorer.single_turn_ascore(sample)
```
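The diff also adds a reference-based variant, `SimpleCriteriaScoreWithReference`, which compares the response against a reference answer. Below is a minimal usage sketch under the same assumptions as the example above (an already-configured `openai_model` LLM wrapper); the metric name, definition string, and sample texts are illustrative, not taken from the diff:

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._simple_criteria import SimpleCriteriaScoreWithReference

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris, France.",  # reference answer to compare against
)

scorer = SimpleCriteriaScoreWithReference(
    name="reference_similarity_score",  # illustrative name
    definition="Score 0 to 5 based on similarity with the reference",
)
scorer.llm = openai_model  # assumes an LLM wrapper is already set up, as above
await scorer.single_turn_ascore(sample)
```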
1 parent 6687cb5

1 file changed: 332 additions, 0 deletions
@@ -0,0 +1,332 @@
from __future__ import annotations

import logging
import typing as t
from collections import Counter
from dataclasses import dataclass, field

from pydantic import BaseModel, Field

from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
from ragas.experimental.llms.prompt import PydanticPrompt
from ragas.metrics.base import (
    MetricType,
    MetricWithLLM,
    MultiTurnMetric,
    SingleTurnMetric,
)

if t.TYPE_CHECKING:
    from langchain_core.callbacks.base import Callbacks


logger = logging.getLogger(__name__)


class SimpleCriteriaOutput(BaseModel):
    reason: str = Field(description="Reason for the scoring")
    score: int = Field(description="The score for the submission")


class SingleTurnSimpleCriteriaInput(BaseModel):
    user_input: str = Field(description="The input to the model")
    response: str = Field(description="The response from the model")
    criteria: str = Field(description="The criteria to evaluate the response")


class SingleTurnSimpleCriteriaWithReferenceInput(SingleTurnSimpleCriteriaInput):
    reference: str = Field(description="The reference response")


class MultiTurnSimpleCriteriaInput(BaseModel):
    user_input: str = Field(description="The input to the model")
    criteria: str = Field(description="The criteria to evaluate the response")


class MultiTurnSimpleCriteriaWithReferenceInput(MultiTurnSimpleCriteriaInput):
    reference: str = Field(description="The reference response")


class SingleTurnSimpleCriteriaPrompt(
    PydanticPrompt[SingleTurnSimpleCriteriaInput, SimpleCriteriaOutput]
):
    instruction = "Given an input and response, evaluate and score the response using only the given criteria."
    input_model = SingleTurnSimpleCriteriaInput
    output_model = SimpleCriteriaOutput
    examples = [
        (
            SingleTurnSimpleCriteriaInput(
                user_input="Who was the director of Los Alamos Laboratory?",
                response="Einstein was the director of Los Alamos Laboratory.",
                criteria="Score responses in range of 0 to 5 based on factors such as grammar, relevance, and coherence.",
            ),
            SimpleCriteriaOutput(
                reason="The response is grammatically correct and relevant to the input.",
                score=5,
            ),
        )
    ]


class SingleTurnSimpleCriteriaWithReferencePrompt(
    PydanticPrompt[SingleTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput]
):
    instruction = "Given an input, system response and reference, evaluate and score the response against the reference using only the given criteria."
    input_model = SingleTurnSimpleCriteriaWithReferenceInput
    output_model = SimpleCriteriaOutput
    examples = [
        (
            SingleTurnSimpleCriteriaWithReferenceInput(
                user_input="Who was the director of Los Alamos Laboratory?",
                response="Einstein was the director of Los Alamos Laboratory.",
                reference="The director of Los Alamos Laboratory was J. Robert Oppenheimer.",
                criteria="Score responses in range of 0 (low) to 5 (high) based on similarity with the reference.",
            ),
            SimpleCriteriaOutput(
                reason="The response and reference have two very different answers.",
                score=0,
            ),
        )
    ]


class MultiTurnSimpleCriteriaPrompt(
    PydanticPrompt[MultiTurnSimpleCriteriaInput, SimpleCriteriaOutput]
):
    instruction = "Given an interaction between Human, AI and Tools, evaluate and score the interaction using the given criteria."
    input_model = MultiTurnSimpleCriteriaInput
    output_model = SimpleCriteriaOutput
    examples = [
        (
            MultiTurnSimpleCriteriaInput(
                user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""",
                criteria="Score the interaction in range of 0 to 5 based on factors such as helpfulness, coherence, and relevance.",
            ),
            SimpleCriteriaOutput(
                reason="The interaction is coherent and relevant to the user's request.",
                score=5,
            ),
        )
    ]


class MultiTurnSimpleCriteriaWithReferencePrompt(
    PydanticPrompt[MultiTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput]
):
    instruction = "Given an interaction between Human, AI and Tools, evaluate and score the interaction against the reference using the given criteria."
    input_model = MultiTurnSimpleCriteriaWithReferenceInput
    output_model = SimpleCriteriaOutput
    examples = [
        (
            MultiTurnSimpleCriteriaWithReferenceInput(
                user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""",
                reference="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking.",
                criteria="Score the interaction in range of 0 to 5 based on factors such as helpfulness, coherence, and relevance.",
            ),
            SimpleCriteriaOutput(
                reason="The interaction is coherent and relevant to the user's request.",
                score=5,
            ),
        )
    ]


class SimpleCriteriaOutout(BaseModel):
    reason: str = Field(description="Reason for the score")
    score: int = Field(description="The score for the submission")


class SimpleCriteriaWithoutReferenceInput(BaseModel):
    user_input: str = Field(description="The input to the model")
    response: str = Field(description="The response from the model")
    criteria: str = Field(description="The criteria to evaluate the response")


@dataclass
class SimpleCriteriaScoreWithoutReference(
    MetricWithLLM, SingleTurnMetric, MultiTurnMetric
):
    """
    Judges the submission and assigns an integer score using the criteria
    specified in the metric definition.

    Attributes
    ----------
    name: str
        name of the metric
    definition: str
        criteria to score the submission
    strictness: int
        The number of times self-consistency checks are made. The final
        judgement is made using a majority vote.
    """

    name: str = field(default="", repr=True)  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {
                "user_input",
                "response",
            },
            MetricType.MULTI_TURN: {
                "user_input",
            },
        }
    )
    single_turn_prompt: PydanticPrompt = field(
        default_factory=lambda: SingleTurnSimpleCriteriaPrompt()
    )
    multi_turn_prompt: PydanticPrompt = field(
        default_factory=lambda: MultiTurnSimpleCriteriaPrompt()
    )
    definition: str = field(default="", repr=True)
    strictness: int = field(default=1, repr=False)
    max_retries: int = 1

    def __post_init__(self: t.Self):
        if self.name == "":
            raise ValueError("Expects a name")
        if self.definition == "":
            raise ValueError("Expects definition")

        # ensure an odd number of checks to avoid a tie in the majority vote
        self.strictness = (
            self.strictness if self.strictness % 2 != 0 else self.strictness + 1
        )

    def _compute_score(
        self, safe_loaded_responses: t.List[SimpleCriteriaOutput]
    ) -> float:
        if self.strictness > 1:
            score = Counter([item.score for item in safe_loaded_responses]).most_common(
                1
            )[0][0]
        else:
            score = safe_loaded_responses[0].score

        return score

    async def _single_turn_ascore(
        self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        row = sample.dict()
        return await self._ascore(row, callbacks)

    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
        assert self.llm is not None, "set LLM before use"

        user_input, context, response = (
            row["user_input"],
            row.get("retrieved_contexts"),
            row["response"],
        )

        if context is not None:
            if isinstance(context, list):
                context = "\n".join(context)
            user_input = f"Question: {user_input} Answer using context: {context}"

        prompt_input = SingleTurnSimpleCriteriaInput(
            user_input=user_input,
            response=response,
            criteria=self.definition,
        )

        response = await self.single_turn_prompt.generate(
            data=prompt_input,
            llm=self.llm,
            callbacks=callbacks,
        )

        return self._compute_score([response])

    async def _multi_turn_ascore(
        self: t.Self, sample: MultiTurnSample, callbacks: Callbacks
    ) -> float:
        assert self.llm is not None, "LLM is not set"

        interaction = sample.pretty_repr()
        prompt_input = MultiTurnSimpleCriteriaInput(
            user_input=interaction,
            criteria=self.definition,
        )
        response = await self.multi_turn_prompt.generate(
            data=prompt_input,
            llm=self.llm,
            callbacks=callbacks,
        )
        return self._compute_score([response])


@dataclass
class SimpleCriteriaScoreWithReference(SimpleCriteriaScoreWithoutReference):
    name: str = field(default="", repr=True)  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {
                "user_input",
                "response",
                "reference",
            },
            MetricType.MULTI_TURN: {
                "user_input",
                "reference",
            },
        }
    )
    single_turn_prompt: PydanticPrompt = field(
        default_factory=lambda: SingleTurnSimpleCriteriaWithReferencePrompt()
    )
    multi_turn_prompt: PydanticPrompt = field(
        default_factory=lambda: MultiTurnSimpleCriteriaWithReferencePrompt()
    )

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        assert self.llm is not None, "LLM is not set"
        assert sample.user_input is not None, "User input is not set"
        assert sample.reference is not None, "Reference is not set"
        assert sample.response is not None, "Response is not set"

        prompt_input = SingleTurnSimpleCriteriaWithReferenceInput(
            user_input=sample.user_input,
            response=sample.response,
            reference=sample.reference,
            criteria=self.definition,
        )

        response = await self.single_turn_prompt.generate(
            data=prompt_input,
            llm=self.llm,
            callbacks=callbacks,
        )

        return self._compute_score([response])

    async def _multi_turn_ascore(
        self, sample: MultiTurnSample, callbacks: Callbacks
    ) -> float:
        assert self.llm is not None, "LLM is not set"
        assert sample.user_input is not None, "User input is not set"
        assert sample.reference is not None, "Reference is not set"

        interaction = sample.pretty_repr()
        prompt_input = MultiTurnSimpleCriteriaWithReferenceInput(
            user_input=interaction,
            reference=sample.reference,
            criteria=self.definition,
        )

        response = await self.multi_turn_prompt.generate(
            data=prompt_input,
            llm=self.llm,
            callbacks=callbacks,
        )

        return self._compute_score([response])

    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
        sample = SingleTurnSample(**row)
        return await self._single_turn_ascore(sample, callbacks)
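A note on the `strictness` attribute: it is meant to aggregate repeated judgements by majority vote, and `__post_init__` forces it to an odd value so a tie cannot occur. The snippet below is a minimal standalone sketch of the selection rule used in `_compute_score`, with purely illustrative scores:

```python
from collections import Counter

# Hypothetical scores from strictness=3 independent judgements (illustrative values only).
scores = [4, 5, 4]

# Same selection rule as _compute_score when strictness > 1:
# the most common score wins; an odd strictness rules out ties.
majority_score = Counter(scores).most_common(1)[0][0]
print(majority_score)  # -> 4
```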
