
Commit a541eb6

feat: simple criteria migrated to collections (#2386)
1 parent b113f52 commit a541eb6
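
In practice the migration means SimpleCriteria is now exported from the public collections namespace instead of only living in its private module. A minimal sanity check (illustrative snippet, not part of this diff):

# Illustrative only: exercises the new export added in this commit.
from ragas.metrics.collections import SimpleCriteria  # re-exported via __init__.py

print(SimpleCriteria)  # the class defined in ragas/metrics/collections/_simple_criteria.py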

File tree

3 files changed (+424, -0 lines)


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,7 @@
 from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
 from ragas.metrics.collections._rouge_score import RougeScore
 from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
+from ragas.metrics.collections._simple_criteria import SimpleCriteria
 from ragas.metrics.collections._string import (
     DistanceMeasure,
     ExactMatch,
@@ -36,6 +37,7 @@
     "NonLLMStringSimilarity",
     "RougeScore",
     "SemanticSimilarity",
+    "SimpleCriteria",
     "StringPresence",
     # AspectCritic helper functions
     "coherence",
src/ragas/metrics/collections/_simple_criteria.py (new file)

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
"""SimpleCriteria metric for custom criteria-based evaluation."""

import typing as t
from collections import Counter

from pydantic import BaseModel, Field

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class SimpleCriteriaOutput(BaseModel):
    """Output for simple criteria evaluation."""

    reason: str = Field(description="Reason for the scoring")
    score: int = Field(description="The score for the submission")


class SimpleCriteria(BaseMetric):
    """
    Judges submissions using custom criteria with configurable scoring.

    Usage:
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms import llm_factory
        >>> from ragas.metrics.collections import SimpleCriteria
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = llm_factory("gpt-4o-mini", client=client)
        >>>
        >>> # Create metric instance
        >>> metric = SimpleCriteria(
        ...     name="clarity",
        ...     definition="Is the response clear and easy to understand?",
        ...     llm=llm,
        ... )
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     user_input="What is machine learning?",
        ...     response="Machine learning is a subset of artificial intelligence..."
        ... )
        >>> print(f"Score: {result.value}")

    Attributes:
        llm: Modern instructor-based LLM for evaluation
        name: The metric name
        definition: Criteria to judge the submission
        strictness: Number of self-consistency checks to make (default: 1)
        allowed_values: Score range for numeric validation
    """

    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        name: str,
        definition: str,
        llm: "InstructorBaseRagasLLM",
        strictness: int = 1,
        allowed_values: t.Tuple[float, float] = (0.0, 10.0),
        **kwargs,
    ):
        """Initialize SimpleCriteria metric with required components."""
        self.llm = llm
        self.definition = definition
        self.strictness = strictness if strictness % 2 != 0 else strictness + 1

        super().__init__(name=name, allowed_values=allowed_values, **kwargs)

    def _build_prompt(
        self,
        user_input: t.Optional[str] = None,
        response: t.Optional[str] = None,
        retrieved_contexts: t.Optional[t.List[str]] = None,
        reference: t.Optional[str] = None,
        reference_contexts: t.Optional[t.List[str]] = None,
    ) -> str:
        """Build the evaluation prompt from inputs."""
        instruction = f"""Evaluate the input based on the criteria defined.
Criteria Definition: {self.definition}

Provide your evaluation in the following format:
- reason: Brief explanation for your score
- score: Integer score for the submission
"""

        input_parts = []
        if user_input is not None:
            input_parts.append(f"User Input: {user_input}")
        if response is not None:
            input_parts.append(f"Response: {response}")
        if retrieved_contexts is not None and len(retrieved_contexts) > 0:
            contexts_str = "\n".join(f" - {ctx}" for ctx in retrieved_contexts)
            input_parts.append(f"Retrieved Contexts:\n{contexts_str}")
        if reference is not None:
            input_parts.append(f"Reference: {reference}")
        if reference_contexts is not None and len(reference_contexts) > 0:
            ref_contexts_str = "\n".join(f" - {ctx}" for ctx in reference_contexts)
            input_parts.append(f"Reference Contexts:\n{ref_contexts_str}")

        input_section = "\n\n".join(input_parts) if input_parts else ""

        return f"{instruction}\n{input_section}"

    async def ascore(
        self,
        user_input: t.Optional[str] = None,
        response: t.Optional[str] = None,
        retrieved_contexts: t.Optional[t.List[str]] = None,
        reference: t.Optional[str] = None,
        reference_contexts: t.Optional[t.List[str]] = None,
    ) -> MetricResult:
        """
        Calculate simple criteria score asynchronously.

        Args:
            user_input: The input to the llm system (optional)
            response: The response from the llm system (optional)
            retrieved_contexts: The retrieved contexts from the llm system (optional)
            reference: The reference answer for evaluation (optional)
            reference_contexts: The reference contexts for evaluation (optional)

        Returns:
            MetricResult with score and reason
        """
        prompt = self._build_prompt(
            user_input=user_input,
            response=response,
            retrieved_contexts=retrieved_contexts,
            reference=reference,
            reference_contexts=reference_contexts,
        )

        scores = []
        reasons = []

        for _ in range(self.strictness):
            result = await self.llm.agenerate(prompt, SimpleCriteriaOutput)
            scores.append(result.score)
            reasons.append(result.reason)

        if self.strictness > 1:
            score = Counter(scores).most_common(1)[0][0]
            majority_score = score
            reason_idx = scores.index(majority_score)
            reason = reasons[reason_idx]
        else:
            score = scores[0]
            reason = reasons[0]

        return MetricResult(value=float(score), reason=reason)
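
For context, a runnable sketch of how the new metric can be exercised end to end. The constructor, ascore signature, and llm_factory call follow the docstring above; the model name, client setup, and asyncio wrapper are illustrative assumptions rather than part of the commit. Note that __init__ rounds strictness up to the next odd number, so the majority vote over the sampled scores cannot tie.

import asyncio

from openai import AsyncOpenAI

from ragas.llms import llm_factory
from ragas.metrics.collections import SimpleCriteria


async def main() -> None:
    # Assumed setup: any client/model accepted by llm_factory works here.
    client = AsyncOpenAI()
    llm = llm_factory("gpt-4o-mini", client=client)

    metric = SimpleCriteria(
        name="clarity",
        definition="Is the response clear and easy to understand?",
        llm=llm,
        strictness=2,  # bumped to 3 internally, so the majority vote cannot tie
    )

    result = await metric.ascore(
        user_input="What is machine learning?",
        response="Machine learning is a subset of artificial intelligence...",
    )
    # With strictness > 1, result.value is the most common of the sampled scores
    # and result.reason is the explanation returned alongside that score.
    print(f"Score: {result.value}, reason: {result.reason}")


if __name__ == "__main__":
    asyncio.run(main())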
