
Commit 8f2f1df

refactor: improve metrics code quality (#2337)
## Changes Made

- Add type hints and docstrings, fix typos, and remove the duplicate create_nano_id.
1 parent 49f47f1 commit 8f2f1df

File tree: 5 files changed (+317, -66 lines)


src/ragas/backends/utils.py

Lines changed: 0 additions & 21 deletions
@@ -3,27 +3,6 @@
 from __future__ import annotations

 import random
-import string
-import typing as t
-import uuid
-
-
-def create_nano_id(size: int = 12) -> str:
-    """Create a short, URL-safe unique identifier."""
-    # Define characters to use (alphanumeric)
-    alphabet = string.ascii_letters + string.digits
-
-    # Generate UUID and convert to int
-    uuid_int = t.cast(int, uuid.uuid4().int)
-
-    # Convert to base62
-    result = ""
-    while uuid_int:
-        uuid_int, remainder = divmod(uuid_int, len(alphabet))
-        result = alphabet[remainder] + result
-
-    # Pad if necessary and return desired length
-    return result[:size]


 class MemorableNames:
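
For reference, the deleted helper encodes a UUID's 128-bit integer in base62 and truncates the result to the requested size. A minimal standalone sketch of that divmod-based conversion (illustrative only; the canonical create_nano_id kept elsewhere in the codebase is not shown in this commit):

```python
import string
import uuid

ALPHABET = string.ascii_letters + string.digits  # 62 URL-safe characters


def to_base62(n: int) -> str:
    """Convert a non-negative integer to its base62 representation."""
    if n == 0:
        return ALPHABET[0]
    digits = []
    while n:
        n, remainder = divmod(n, len(ALPHABET))
        digits.append(ALPHABET[remainder])
    return "".join(reversed(digits))


# A UUID's 128-bit integer encodes to roughly 22 base62 characters;
# truncating to 12 mirrors the removed helper's default size.
print(to_base62(uuid.uuid4().int)[:12])
```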

src/ragas/metrics/base.py

Lines changed: 113 additions & 13 deletions
@@ -139,7 +139,16 @@ def get_required_columns(
         return self.required_columns

     @abstractmethod
-    def init(self, run_config: RunConfig): ...
+    def init(self, run_config: RunConfig) -> None:
+        """
+        Initialize the metric with the given run configuration.
+
+        Parameters
+        ----------
+        run_config : RunConfig
+            Configuration for the metric run including timeouts and other settings.
+        """
+        ...

     @deprecated("0.2", removal="0.3", alternative="single_turn_ascore")
     def score(self, row: t.Dict, callbacks: Callbacks = None) -> float:
@@ -229,10 +238,23 @@ class MetricWithLLM(Metric, PromptMixin):
     llm: t.Optional[BaseRagasLLM] = None
     output_type: t.Optional[MetricOutputType] = None

-    def init(self, run_config: RunConfig):
+    def init(self, run_config: RunConfig) -> None:
+        """
+        Initialize the metric with run configuration and validate LLM is present.
+
+        Parameters
+        ----------
+        run_config : RunConfig
+            Configuration for the metric run.
+
+        Raises
+        ------
+        ValueError
+            If no LLM is provided to the metric.
+        """
         if self.llm is None:
             raise ValueError(
-                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run."  # noqa
+                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please instantiate the metric with an LLM to run."
             )
         self.llm.set_run_config(run_config)

@@ -735,29 +757,106 @@ class ModeMetric(t.Protocol):

 @dataclass
 class SimpleBaseMetric(ABC):
-    """Base class for simple metrics that return MetricResult objects."""
+    """
+    Base class for simple metrics that return MetricResult objects.
+
+    This class provides the foundation for metrics that evaluate inputs
+    and return structured MetricResult objects containing scores and reasoning.
+
+    Attributes
+    ----------
+    name : str
+        The name of the metric.
+    allowed_values : AllowedValuesType
+        Allowed values for the metric output. Can be a list of strings for
+        discrete metrics, a tuple of floats for numeric metrics, or an integer
+        for ranking metrics.
+
+    Examples
+    --------
+    >>> from ragas.metrics import discrete_metric
+    >>>
+    >>> @discrete_metric(name="sentiment", allowed_values=["positive", "negative"])
+    >>> def sentiment_metric(user_input: str, response: str) -> str:
+    ...     return "positive" if "good" in response else "negative"
+    >>>
+    >>> result = sentiment_metric(user_input="How are you?", response="I'm good!")
+    >>> print(result.value)  # "positive"
+    """

     name: str
     allowed_values: AllowedValuesType = field(default_factory=lambda: ["pass", "fail"])

     @abstractmethod
     def score(self, **kwargs) -> "MetricResult":
+        """
+        Synchronously calculate the metric score.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Input parameters required by the specific metric implementation.
+
+        Returns
+        -------
+        MetricResult
+            The evaluation result containing the score and reasoning.
+        """
         pass

     @abstractmethod
     async def ascore(self, **kwargs) -> "MetricResult":
+        """
+        Asynchronously calculate the metric score.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Input parameters required by the specific metric implementation.
+
+        Returns
+        -------
+        MetricResult
+            The evaluation result containing the score and reasoning.
+        """
         pass

     def batch_score(
         self,
         inputs: t.List[t.Dict[str, t.Any]],
     ) -> t.List["MetricResult"]:
+        """
+        Synchronously calculate scores for a batch of inputs.
+
+        Parameters
+        ----------
+        inputs : List[Dict[str, Any]]
+            List of input dictionaries, each containing parameters for the metric.
+
+        Returns
+        -------
+        List[MetricResult]
+            List of evaluation results, one for each input.
+        """
         return [self.score(**input_dict) for input_dict in inputs]

     async def abatch_score(
         self,
         inputs: t.List[t.Dict[str, t.Any]],
     ) -> t.List["MetricResult"]:
+        """
+        Asynchronously calculate scores for a batch of inputs in parallel.
+
+        Parameters
+        ----------
+        inputs : List[Dict[str, Any]]
+            List of input dictionaries, each containing parameters for the metric.
+
+        Returns
+        -------
+        List[MetricResult]
+            List of evaluation results, one for each input.
+        """
         async_tasks = []
         for input_dict in inputs:
             # Process input asynchronously
@@ -767,29 +866,30 @@ async def abatch_score(
         return await asyncio.gather(*async_tasks)


-def create_auto_response_model(name: str, **fields):
-    """Create a response model and mark it as auto-generated by Ragas.
+def create_auto_response_model(name: str, **fields) -> t.Type["BaseModel"]:
+    """
+    Create a response model and mark it as auto-generated by Ragas.

     This function creates a Pydantic model using create_model and marks it
     with a special attribute to indicate it was auto-generated. This allows
     the save() method to distinguish between auto-generated models (which
     are recreated on load) and custom user models.

-    Parameters:
-    -----------
+    Parameters
+    ----------
     name : str
         Name for the model class
     **fields
-        Field definitions in create_model format
+        Field definitions in create_model format.
         Each field is specified as: field_name=(type, default_or_field_info)

-    Returns:
-    --------
+    Returns
+    -------
     Type[BaseModel]
         Pydantic model class marked as auto-generated

-    Examples:
-    ---------
+    Examples
+    --------
     >>> from pydantic import Field
     >>> # Simple model with required fields
     >>> ResponseModel = create_auto_response_model(
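
The Examples section of create_auto_response_model is cut off at the end of this hunk. For orientation, the field-definition format it forwards to Pydantic is the same one create_model accepts, field_name=(type, default_or_field_info). A minimal sketch using create_model directly, with illustrative field names that are not taken from the commit:

```python
from pydantic import Field, create_model

# Each keyword argument follows the field_name=(type, default_or_field_info)
# convention described in the docstring above.
ResponseModel = create_model(
    "ResponseModel",
    reason=(str, Field(..., description="Reasoning for the value")),
    value=(str, Field(..., description="The value predicted")),
)

print(ResponseModel(reason="matches the reference answer", value="pass"))
```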

src/ragas/metrics/discrete.py

Lines changed: 68 additions & 12 deletions
@@ -17,6 +17,33 @@

 @dataclass(repr=False)
 class DiscreteMetric(SimpleLLMMetric, DiscreteValidator):
+    """
+    Metric for categorical/discrete evaluations with predefined allowed values.
+
+    This class is used for metrics that output categorical values like
+    "pass/fail", "good/bad/excellent", or custom discrete categories.
+
+    Attributes
+    ----------
+    allowed_values : List[str]
+        List of allowed categorical values the metric can output.
+        Default is ["pass", "fail"].
+
+    Examples
+    --------
+    >>> from ragas.metrics import DiscreteMetric
+    >>> from ragas.llms import LangchainLLMWrapper
+    >>> from langchain_openai import ChatOpenAI
+    >>>
+    >>> # Create a custom discrete metric
+    >>> llm = LangchainLLMWrapper(ChatOpenAI())
+    >>> metric = DiscreteMetric(
+    ...     name="quality_check",
+    ...     llm=llm,
+    ...     allowed_values=["excellent", "good", "poor"]
+    ... )
+    """
+
     allowed_values: t.List[str] = field(default_factory=lambda: ["pass", "fail"])

     def __post_init__(self):
@@ -27,8 +54,8 @@ def __post_init__(self):

         self._response_model = create_auto_response_model(
             "DiscreteResponseModel",
-            reason=(str, Field(..., description="Reaoning for the value")),
-            value=(t.Literal[values], Field(..., description="the value predicted")),
+            reason=(str, Field(..., description="Reasoning for the value")),
+            value=(t.Literal[values], Field(..., description="The value predicted")),
         )

     def get_correlation(
@@ -88,18 +115,47 @@ def discrete_metric(
     *,
     name: t.Optional[str] = None,
     allowed_values: t.Optional[t.List[str]] = None,
-    **metric_params,
+    **metric_params: t.Any,
 ) -> t.Callable[[t.Callable[..., t.Any]], DiscreteMetricProtocol]:
     """
-    Decorator for creating discrete metrics.
-
-    Args:
-        name: Optional name for the metric (defaults to function name)
-        allowed_values: List of allowed string values for the metric
-        **metric_params: Additional parameters for the metric
-
-    Returns:
-        A decorator that transforms a function into a DiscreteMetric instance
+    Decorator for creating discrete/categorical metrics.
+
+    This decorator transforms a regular function into a DiscreteMetric instance
+    that can be used for evaluation with predefined categorical outputs.
+
+    Parameters
+    ----------
+    name : str, optional
+        Name for the metric. If not provided, uses the function name.
+    allowed_values : List[str], optional
+        List of allowed categorical values for the metric output.
+        Default is ["pass", "fail"].
+    **metric_params : Any
+        Additional parameters to pass to the metric initialization.
+
+    Returns
+    -------
+    Callable[[Callable[..., Any]], DiscreteMetricProtocol]
+        A decorator that transforms a function into a DiscreteMetric instance.
+
+    Examples
+    --------
+    >>> from ragas.metrics import discrete_metric
+    >>>
+    >>> @discrete_metric(name="sentiment", allowed_values=["positive", "neutral", "negative"])
+    >>> def sentiment_analysis(user_input: str, response: str) -> str:
+    ...     '''Analyze sentiment of the response.'''
+    ...     if "great" in response.lower() or "good" in response.lower():
+    ...         return "positive"
+    ...     elif "bad" in response.lower() or "poor" in response.lower():
+    ...         return "negative"
+    ...     return "neutral"
+    >>>
+    >>> result = sentiment_analysis(
+    ...     user_input="How was your day?",
+    ...     response="It was great!"
+    ... )
+    >>> print(result.value)  # "positive"
     """
     if allowed_values is None:
         allowed_values = ["pass", "fail"]
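
The __post_init__ change above builds the response model with a t.Literal over the metric's allowed values, so the predicted value is validated against that list. A hand-rolled sketch of the same constraint using Pydantic directly (the QualityResponse model here is illustrative, not part of the commit):

```python
import typing as t

from pydantic import BaseModel, ValidationError


class QualityResponse(BaseModel):
    # Mirrors the auto-generated DiscreteResponseModel: the Literal field
    # restricts the predicted value to the metric's allowed_values.
    reason: str
    value: t.Literal["excellent", "good", "poor"]


print(QualityResponse(reason="clear and complete", value="good").value)

try:
    QualityResponse(reason="off the scale", value="mediocre")
except ValidationError:
    print("values outside allowed_values fail validation")
```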

0 commit comments
