
Commit e9fb710

chore: move rubrics based metric into single file (#1287)
```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._domain_specific_rubrics import RubricsScoreWithReference

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris.",
)

rubrics = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}

scorer = RubricsScoreWithReference(rubrics=rubrics)
scorer.llm = openai_model
await scorer.single_turn_ascore(sample)
```

---------

Co-authored-by: Jithin James <[email protected]>
1 parent d219159 commit e9fb710
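
The commit message above exercises the reference-based metric. Below is a minimal sketch of the reference-free counterpart that this commit consolidates into the same module; the `LangchainLLMWrapper`/`ChatOpenAI` setup and the model name are assumptions for illustration, not part of the commit:

```python
# Sketch only: assumes an async context (e.g. a notebook), that langchain-openai
# is installed, and that OPENAI_API_KEY is set.
from langchain_openai import ChatOpenAI

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper  # assumed wrapper; any ragas-compatible LLM works
from ragas.metrics._domain_specific_rubrics import RubricsScoreWithoutReference

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
)

scorer = RubricsScoreWithoutReference()  # no reference needed; defaults to DEFAULT_REFERENCE_FREE_RUBRICS
scorer.llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))  # model name is an assumption
await scorer.single_turn_ascore(sample)
```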


5 files changed: +163 −198 lines changed


src/ragas/metrics/__init__.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -16,19 +16,19 @@
     context_utilization,
 )
 from ragas.metrics._context_recall import ContextRecall, context_recall
+from ragas.metrics._domain_specific_rubrics import (
+    RubricsScoreWithoutReference,
+    RubricsScoreWithReference,
+    rubrics_score_with_reference,
+    rubrics_score_without_reference,
+)
 from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
 from ragas.metrics._noise_sensitivity import (
     NoiseSensitivity,
     noise_sensitivity_irrelevant,
     noise_sensitivity_relevant,
 )
 from ragas.metrics._summarization import SummarizationScore, summarization_score
-from ragas.metrics.domain_specific_rubrics import (
-    RubricsScoreWithoutReference,
-    RubricsScoreWithReference,
-    rubrics_score_with_reference,
-    rubrics_score_without_reference,
-)
 
 __all__ = [
     "AnswerCorrectness",
```

src/ragas/metrics/domain_specific_rubrics/with_reference.py renamed to src/ragas/metrics/_domain_specific_rubrics.py

Lines changed: 149 additions & 14 deletions
```diff
@@ -21,6 +21,14 @@
 logger = logging.getLogger(__name__)
 
 
+DEFAULT_REFERENCE_FREE_RUBRICS = {
+    "score1_description": "The response is incorrect or does not answer the question.",
+    "score2_description": "The response is partially correct but may include errors or incomplete information.",
+    "score3_description": "The response is generally correct but lacks clarity or completeness.",
+    "score4_description": "The response is correct and clear, with minor issues or missing details.",
+    "score5_description": "The response is completely accurate, clear, and answers the question directly.",
+}
+
 DEFAULT_WITH_REFERENCE_RUBRICS = {
     "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
     "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
@@ -35,30 +43,156 @@ class ScoreFeedback(BaseModel):
     score: int = Field(..., description="The score given to the response")
 
 
-class SingleTurnWithRefernceInput(BaseModel):
+class SingleTurnWithoutReferenceInput(BaseModel):
+    user_input: str = Field(..., description="The user input")
+    response: str = Field(..., description="The response")
+    rubrics: t.Dict[str, str] = Field(..., description="The rubric")
+
+
+class MultiTurnWithoutReferenceInput(BaseModel):
+    user_input: str = Field(..., description="The user input")
+    rubrics: t.Dict[str, str] = Field(..., description="The rubric")
+
+
+class SingleTurnWithoutReferencePrompt(
+    PydanticPrompt[SingleTurnWithoutReferenceInput, ScoreFeedback]
+):
+    instruction = """Given an user_input (which might contain an input along with it), a response to evaluate, and a score rubric representing evaluation criteria are given.
+    1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, without evaluating in general.
+    2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric."""
+    input_model = SingleTurnWithoutReferenceInput
+    output_model = ScoreFeedback
+    examples = [
+        (
+            SingleTurnWithoutReferenceInput(
+                user_input="What is the capital of France?",
+                response="The capital of France is Paris.",
+                rubrics=DEFAULT_REFERENCE_FREE_RUBRICS,
+            ),
+            ScoreFeedback(
+                feedback="The response is completely accurate and directly answers the question about the capital of France.",
+                score=5,
+            ),
+        )
+    ]
+
+
+class MultiTurnWithoutReferencePrompt(
+    PydanticPrompt[MultiTurnWithoutReferenceInput, ScoreFeedback]
+):
+    instruction = """Given an interaction between AI,Human and external Tool as input and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given.
+    1. Write detailed feedback that assesses the quality of the responselet strictly based on the given score rubric, without evaluating in general.
+    2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric."""
+    input_model = MultiTurnWithoutReferenceInput
+    output_model = ScoreFeedback
+    examples = [
+        (
+            MultiTurnWithoutReferenceInput(
+                user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""",
+                rubrics=DEFAULT_REFERENCE_FREE_RUBRICS,
+            ),
+            ScoreFeedback(feedback="", score=5),
+        )
+    ]
+
+
+@dataclass
+class RubricsScoreWithoutReference(MetricWithLLM, SingleTurnMetric, MultiTurnMetric):
+    name: str = "rubrics_score_without_reference"  # type: ignore
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {"user_input", "response"},
+            MetricType.MULTI_TURN: {
+                "user_input",
+            },
+        }
+    )
+    rubrics: t.Dict[str, str] = field(
+        default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS
+    )
+    max_retries: int = 1
+
+    def __post_init__(self):
+        self.single_turn_scoring_prompt = SingleTurnWithoutReferencePrompt()
+        self.multi_turn_scoring_prompt = MultiTurnWithoutReferencePrompt()
+        self.rubrics = self.rubrics or DEFAULT_REFERENCE_FREE_RUBRICS
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks: Callbacks
+    ) -> float:
+        return await self._ascore(sample.dict(), callbacks)
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        assert self.llm is not None, "LLM is not set"
+
+        prompt_input = self._create_single_turn_prompt(row)
+        output = await self.single_turn_scoring_prompt.generate(
+            data=prompt_input,
+            llm=self.llm,
+            callbacks=callbacks,
+        )
+        return output.score
+
+    async def _multi_turn_ascore(
+        self, sample: MultiTurnSample, callbacks: Callbacks
+    ) -> float:
+        assert self.llm is not None, "LLM is not set"
+
+        interaction = sample.pretty_repr()
+        prompt_input = MultiTurnWithoutReferenceInput(
+            user_input=interaction,
+            rubrics=self.rubrics,
+        )
+        output = await self.multi_turn_scoring_prompt.generate(
+            data=prompt_input,
+            llm=self.llm,
+            callbacks=callbacks,
+        )
+        return output.score
+
+    def _create_single_turn_prompt(
+        self, row: t.Dict
+    ) -> SingleTurnWithoutReferenceInput:
+        question, contexts, answer = (
+            row["user_input"],
+            row.get("retrieved_contexts"),
+            row["response"],
+        )
+        if contexts:
+            contexts = "\n".join(contexts)
+            question = f"{question} answer using context: {contexts}"
+
+        return SingleTurnWithoutReferenceInput(
+            user_input=question,
+            response=answer,
+            rubrics=self.rubrics,
+        )
+
+
+class SingleTurnWithReferenceInput(BaseModel):
     user_input: str = Field(..., description="The user input")
     response: str = Field(..., description="The response")
     reference: str = Field(..., description="The reference")
     rubrics: t.Dict[str, str] = Field(..., description="The rubric")
 
 
-class MultiTurnWithRefernceInput(BaseModel):
+class MultiTurnWithReferenceInput(BaseModel):
     user_input: str = Field(..., description="The user input")
     reference: str = Field(..., description="The reference")
     rubrics: t.Dict[str, str] = Field(..., description="The rubric")
 
 
 class SingleTurnWithReferencePrompt(
-    PydanticPrompt[SingleTurnWithRefernceInput, ScoreFeedback]
+    PydanticPrompt[SingleTurnWithReferenceInput, ScoreFeedback]
 ):
-    instruction = """Given an interaction between AI,Human and external Tool as input and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given.
-    1. Write detailed feedback that assesses the quality of the responselet strictly based on the given score rubric, without evaluating in general.
+    instruction = """Given user input, response and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given.
+    1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, without evaluating in general.
     2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric."""
-    input_model = SingleTurnWithRefernceInput
+    input_model = SingleTurnWithReferenceInput
     output_model = ScoreFeedback
     examples = [
         (
-            SingleTurnWithRefernceInput(
+            SingleTurnWithReferenceInput(
                 user_input="What is the capital of France?",
                 response="The capital of France is Paris.",
                 reference="The capital of France is Paris.",
@@ -73,16 +207,16 @@ class SingleTurnWithReferencePrompt(
 
 
 class MultiTurnWithReferencePrompt(
-    PydanticPrompt[MultiTurnWithRefernceInput, ScoreFeedback]
+    PydanticPrompt[MultiTurnWithReferenceInput, ScoreFeedback]
 ):
     instruction = """Given an interaction between AI,Human and external Tool as input and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given.
     1. Write detailed feedback that assesses the quality of the responselet strictly based on the given score rubric, without evaluating in general.
     2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric."""
-    input_model = MultiTurnWithRefernceInput
+    input_model = MultiTurnWithReferenceInput
     output_model = ScoreFeedback
     examples = [
         (
-            MultiTurnWithRefernceInput(
+            MultiTurnWithReferenceInput(
                 user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""",
                 reference="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking.",
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
@@ -148,15 +282,15 @@ async def _multi_turn_ascore(
         )
         return output.score
 
-    def _create_multi_turn_prompt(self, row: t.Dict) -> MultiTurnWithRefernceInput:
+    def _create_multi_turn_prompt(self, row: t.Dict) -> MultiTurnWithReferenceInput:
         interaction, reference = row["interaction"], row["reference"]
-        return MultiTurnWithRefernceInput(
+        return MultiTurnWithReferenceInput(
             user_input=interaction,
             reference=reference,
             rubrics=self.rubrics,
         )
 
-    def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithRefernceInput:
+    def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithReferenceInput:
         question, contexts, answer, ground_truth = (
             row["user_input"],
             row.get("retrieved_contexts"),
@@ -167,7 +301,7 @@ def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithRefernceInput
             contexts = "\n".join(contexts)
             question = f"{question} answer using context: {contexts}"
 
-        return SingleTurnWithRefernceInput(
+        return SingleTurnWithReferenceInput(
             user_input=question,
             response=answer,
             reference=ground_truth,
@@ -176,3 +310,4 @@ def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithRefernceInput
 
 
 rubrics_score_with_reference = RubricsScoreWithReference()
+rubrics_score_without_reference = RubricsScoreWithoutReference()
```
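
`RubricsScoreWithoutReference` also implements the multi-turn path by flattening the conversation with `sample.pretty_repr()`. A rough sketch of how that could be driven; the `ragas.messages` classes, their import path, and `evaluator_llm` are assumptions for illustration, not part of this diff:

```python
# Sketch only: HumanMessage/AIMessage and their import path are assumed,
# as is `evaluator_llm` (any ragas-compatible LLM wrapper); run in an async context.
from ragas.dataset_schema import MultiTurnSample
from ragas.messages import AIMessage, HumanMessage
from ragas.metrics._domain_specific_rubrics import RubricsScoreWithoutReference

conversation = MultiTurnSample(
    user_input=[
        HumanMessage(content="Book a table at the nearest Chinese restaurant for 8:00pm."),
        AIMessage(content="Your table at Golden Dragon is booked for 8:00pm."),
    ]
)

scorer = RubricsScoreWithoutReference()  # falls back to DEFAULT_REFERENCE_FREE_RUBRICS
scorer.llm = evaluator_llm
# pretty_repr() turns the message list into the single user_input string the prompt expects.
await scorer.multi_turn_ascore(conversation)
```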

src/ragas/metrics/base.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -60,7 +60,8 @@ class Metric(ABC):
 
     @property
     @abstractmethod
-    def name(self) -> str: ...
+    def name(self) -> str:
+        ...
 
     @property
     def required_columns(self) -> t.Dict[str, t.Set[str]]:
@@ -147,7 +148,8 @@ async def ascore(
         return score
 
     @abstractmethod
-    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ...
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        ...
 
 
 @dataclass
@@ -254,7 +256,8 @@ async def _single_turn_ascore(
         self,
         sample: SingleTurnSample,
         callbacks: Callbacks,
-    ) -> float: ...
+    ) -> float:
+        ...
 
 
 class MultiTurnMetric(Metric):
@@ -306,7 +309,8 @@ async def _multi_turn_ascore(
         self,
         sample: MultiTurnSample,
         callbacks: Callbacks,
-    ) -> float: ...
+    ) -> float:
+        ...
 
 
 class Ensember:
```
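
For orientation, the abstract methods touched in this formatting cleanup are the surface a custom metric implements. An illustrative toy metric, under the assumption that `MetricType` and `SingleTurnMetric` are importable from `ragas.metrics.base` as used elsewhere in this diff:

```python
# Illustrative only: a toy metric written against the abstract interface shown above.
# Import paths follow the modules touched in this diff but are not verified here.
import typing as t
from dataclasses import dataclass, field

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType, SingleTurnMetric


@dataclass
class ResponseLength(SingleTurnMetric):
    name: str = "response_length"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {MetricType.SINGLE_TURN: {"response"}}
    )

    def init(self, run_config):
        ...  # nothing to set up for this toy metric

    async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks) -> float:
        return await self._ascore(sample.dict(), callbacks)

    async def _ascore(self, row: t.Dict, callbacks) -> float:
        # Longer responses score closer to 1.0; purely illustrative scoring rule.
        return min(len(row["response"]) / 100.0, 1.0)
```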

src/ragas/metrics/domain_specific_rubrics/__init__.py

Lines changed: 0 additions & 15 deletions
This file was deleted.
