
Commit 01b7cac

feat: small refactors and cleanups (#1493)
1 parent 9de4218 commit 01b7cac

File tree

11 files changed: +227 -167 lines changed


docs/references/evaluation_schema.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
     options:
         members_order: "source"
 
-::: ragas.evaluation.Result
+::: ragas.evaluation.EvaluationResult
     options:
         show_root_heading: True
 
mkdocs.yml

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ nav:
       - Seed Generation with Production Data: howtos/customizations/testset_generation/seed_generation_with_production_data.md
   - Applications:
     - howtos/applications/index.md
-    - Cost Analysis: howtos/applications/cost.md
+    - Cost Analysis: howtos/applications/_cost.md
   - Integrations:
     - howtos/integrations/index.md
   - Migrations:

src/ragas/dataset_schema.py

Lines changed: 177 additions & 6 deletions
@@ -2,15 +2,21 @@
 
 import json
 import typing as t
+from dataclasses import dataclass, field
 
+from datasets import Dataset as HFDataset
 from pydantic import BaseModel, field_validator
 
+from ragas.cost import CostCallbackHandler
 from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
+from ragas.utils import safe_nanmean
 
 if t.TYPE_CHECKING:
     from datasets import Dataset as HFDataset
     from pandas import DataFrame as PandasDataframe
 
+    from ragas.cost import TokenUsage
+
 
 class BaseSample(BaseModel):
     """
@@ -145,7 +151,7 @@ def validate_samples(cls, samples: t.List[BaseSample]) -> t.List[BaseSample]:
 
         return samples
 
-    def get_sample_type(self):
+    def get_sample_type(self) -> t.Type[Sample]:
         """Returns the type of the samples in the dataset."""
         return type(self.samples[0])
 
@@ -175,7 +181,7 @@ def to_hf_dataset(self) -> HFDataset:
         return HFDataset.from_list(self._to_list())
 
     @classmethod
-    def from_hf_dataset(cls, dataset: HFDataset) -> "RagasDataset[Sample]":
+    def from_hf_dataset(cls, dataset: HFDataset):
         """Creates an EvaluationDataset from a Hugging Face Dataset."""
         return cls.from_list(dataset.to_list())
 
@@ -266,11 +272,17 @@ def __iter__(self) -> t.Iterator[Sample]:  # type: ignore
     def __len__(self) -> int:
         return len(self.samples)
 
-    def __getitem__(self, idx: int) -> Sample:
-        return self.samples[idx]
+    def __str__(self) -> str:
+        return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
 
+SingleTurnSampleOrMultiTurnSample = t.Union[SingleTurnSample, MultiTurnSample]
 
-class EvaluationDataset(RagasDataset[BaseSample]):
+
+class EvaluationDataset(RagasDataset[SingleTurnSampleOrMultiTurnSample]):
     """
     Represents a dataset of evaluation samples.
 
@@ -295,6 +307,165 @@ class EvaluationDataset(RagasDataset[BaseSample]):
         Creates an EvaluationDataset from a list of dictionaries.
     from_dict(mapping)
         Creates an EvaluationDataset from a dictionary.
+    from_csv(path)
+        Creates an EvaluationDataset from a CSV file.
+    to_csv(path)
+        Converts the dataset to a CSV file.
+    to_jsonl(path)
+        Converts the dataset to a JSONL file.
+    from_jsonl(path)
+        Creates an EvaluationDataset from a JSONL file.
     """
 
-    pass
+    @t.overload
+    def __getitem__(self, idx: int) -> SingleTurnSampleOrMultiTurnSample: ...
+
+    @t.overload
+    def __getitem__(self, idx: slice) -> "EvaluationDataset": ...
+
+    def __getitem__(
+        self, idx: t.Union[int, slice]
+    ) -> t.Union[SingleTurnSampleOrMultiTurnSample, "EvaluationDataset"]:
+        if isinstance(idx, int):
+            return self.samples[idx]
+        elif isinstance(idx, slice):
+            return type(self)(samples=self.samples[idx])
+        else:
+            raise TypeError("Index must be int or slice")
+
+
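Before the diff continues, a quick sketch of what the new overloads buy: integer indexing returns a single sample, while slicing returns a new `EvaluationDataset`, so subsets keep the full dataset API. The field names used here (`user_input`, `response`) are assumed from `SingleTurnSample`, and the values are made up.

```python
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

dataset = EvaluationDataset(
    samples=[
        SingleTurnSample(user_input="What is Ragas?", response="An evaluation toolkit."),
        SingleTurnSample(user_input="What does it measure?", response="LLM output quality."),
        SingleTurnSample(user_input="Is it open source?", response="Yes."),
    ]
)

first = dataset[0]    # int index -> a single sample
subset = dataset[:2]  # slice -> a new EvaluationDataset, via the overload above
print(subset)         # new __str__: EvaluationDataset(features=[...], len=2)
```

The same hunk continues with the new `EvaluationResult` dataclass, which the docs change above now exposes as `ragas.evaluation.EvaluationResult`.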
+@dataclass
+class EvaluationResult:
+    """
+    A class to store and process the results of the evaluation.
+
+    Attributes
+    ----------
+    scores : list of dict
+        The per-sample scores of the evaluation.
+    dataset : EvaluationDataset, optional
+        The original dataset used for the evaluation. Default is None.
+    binary_columns : list of str, optional
+        List of columns that are binary metrics. Default is an empty list.
+    cost_cb : CostCallbackHandler, optional
+        The callback handler for cost computation. Default is None.
+    """
+
+    scores: t.List[t.Dict[str, t.Any]]
+    dataset: t.Optional[EvaluationDataset] = None
+    binary_columns: t.List[str] = field(default_factory=list)
+    cost_cb: t.Optional[CostCallbackHandler] = None
+
+    def __post_init__(self):
+        # transform scores from list of dicts to dict of lists
+        self._scores_dict = {
+            k: [d[k] for d in self.scores] for k in self.scores[0].keys()
+        }
+
+        values = []
+        self._repr_dict = {}
+        for metric_name in self._scores_dict.keys():
+            value = safe_nanmean(self._scores_dict[metric_name])
+            self._repr_dict[metric_name] = value
+            if metric_name not in self.binary_columns:
+                value = t.cast(float, value)
+                values.append(value + 1e-10)
+
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        """
+        Convert the result to a pandas DataFrame.
+
+        Parameters
+        ----------
+        batch_size : int, optional
+            The batch size for conversion. Default is None.
+        batched : bool, optional
+            Whether to convert in batches. Default is False.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The result as a pandas DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is not provided.
+        """
+        try:
+            import pandas as pd
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed. Please install it to use this function."
+            )
+
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert len(self.scores) == len(self.dataset)
+        # convert both to pandas dataframes and concatenate
+        scores_df = pd.DataFrame(self.scores)
+        dataset_df = self.dataset.to_pandas()
+        return pd.concat([dataset_df, scores_df], axis=1)
+
+    def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
+        """
+        Compute the total tokens used in the evaluation.
+
+        Returns
+        -------
+        list of TokenUsage or TokenUsage
+            The total tokens used.
+
+        Raises
+        ------
+        ValueError
+            If the cost callback handler is not provided.
+        """
+        if self.cost_cb is None:
+            raise ValueError(
+                "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
+            )
+        return self.cost_cb.total_tokens()
+
+    def total_cost(
+        self,
+        cost_per_input_token: t.Optional[float] = None,
+        cost_per_output_token: t.Optional[float] = None,
+        per_model_costs: t.Dict[str, t.Tuple[float, float]] = {},
+    ) -> float:
+        """
+        Compute the total cost of the evaluation.
+
+        Parameters
+        ----------
+        cost_per_input_token : float, optional
+            The cost per input token. Default is None.
+        cost_per_output_token : float, optional
+            The cost per output token. Default is None.
+        per_model_costs : dict of str to tuple of float, optional
+            The per-model costs. Default is an empty dictionary.
+
+        Returns
+        -------
+        float
+            The total cost of the evaluation.
+
+        Raises
+        ------
+        ValueError
+            If the cost callback handler is not provided.
+        """
+        if self.cost_cb is None:
+            raise ValueError(
+                "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
+            )
+        return self.cost_cb.total_cost(
+            cost_per_input_token, cost_per_output_token, per_model_costs
+        )
+
+    def __repr__(self) -> str:
+        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
+        return "{" + ", ".join(score_strs) + "}"
+
+    def __getitem__(self, key: str) -> t.List[float]:
+        return self._scores_dict[key]
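And a minimal sketch of the new `EvaluationResult` on its own, with made-up scores. Without a `dataset` or `cost_cb` attached, `to_pandas()`, `total_tokens()`, and `total_cost()` raise `ValueError` as documented above.

```python
from ragas.dataset_schema import EvaluationResult

result = EvaluationResult(
    scores=[
        {"faithfulness": 0.90, "answer_relevancy": 0.81},
        {"faithfulness": 0.70, "answer_relevancy": 0.95},
    ]
)

print(result)                  # {'faithfulness': 0.8000, 'answer_relevancy': 0.8800}
print(result["faithfulness"])  # [0.9, 0.7] -- per-sample scores for one metric
```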
