@@ -2,15 +2,21 @@
 
 import json
 import typing as t
+from dataclasses import dataclass, field
 
+from datasets import Dataset as HFDataset
 from pydantic import BaseModel, field_validator
 
+from ragas.cost import CostCallbackHandler
 from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
+from ragas.utils import safe_nanmean
 
 if t.TYPE_CHECKING:
     from datasets import Dataset as HFDataset
     from pandas import DataFrame as PandasDataframe
 
+    from ragas.cost import TokenUsage
+
 
 class BaseSample(BaseModel):
     """
@@ -145,7 +151,7 @@ def validate_samples(cls, samples: t.List[BaseSample]) -> t.List[BaseSample]:
 
         return samples
 
-    def get_sample_type(self):
+    def get_sample_type(self) -> t.Type[Sample]:
         """Returns the type of the samples in the dataset."""
         return type(self.samples[0])
 
@@ -175,7 +181,7 @@ def to_hf_dataset(self) -> HFDataset:
         return HFDataset.from_list(self._to_list())
 
     @classmethod
-    def from_hf_dataset(cls, dataset: HFDataset) -> "RagasDataset[Sample]":
+    def from_hf_dataset(cls, dataset: HFDataset):
         """Creates an EvaluationDataset from a Hugging Face Dataset."""
         return cls.from_list(dataset.to_list())
 
@@ -266,11 +272,17 @@ def __iter__(self) -> t.Iterator[Sample]: # type: ignore
     def __len__(self) -> int:
         return len(self.samples)
 
-    def __getitem__(self, idx: int) -> Sample:
-        return self.samples[idx]
+    def __str__(self) -> str:
+        return f"EvaluationDataset(features={self.features()}, len={len(self.samples)})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
 
+SingleTurnSampleOrMultiTurnSample = t.Union[SingleTurnSample, MultiTurnSample]
 
-class EvaluationDataset(RagasDataset[BaseSample]):
+
+class EvaluationDataset(RagasDataset[SingleTurnSampleOrMultiTurnSample]):
     """
     Represents a dataset of evaluation samples.
 
@@ -295,6 +307,165 @@ class EvaluationDataset(RagasDataset[BaseSample]):
         Creates an EvaluationDataset from a list of dictionaries.
     from_dict(mapping)
         Creates an EvaluationDataset from a dictionary.
+    from_csv(path)
+        Creates an EvaluationDataset from a CSV file.
+    to_csv(path)
+        Converts the dataset to a CSV file.
+    to_jsonl(path)
+        Converts the dataset to a JSONL file.
+    from_jsonl(path)
+        Creates an EvaluationDataset from a JSONL file.
     """
 
-    pass
+    @t.overload
+    def __getitem__(self, idx: int) -> SingleTurnSampleOrMultiTurnSample: ...
+
+    @t.overload
+    def __getitem__(self, idx: slice) -> "EvaluationDataset": ...
+
+    def __getitem__(
+        self, idx: t.Union[int, slice]
+    ) -> t.Union[SingleTurnSampleOrMultiTurnSample, "EvaluationDataset"]:
+        if isinstance(idx, int):
+            return self.samples[idx]
+        elif isinstance(idx, slice):
+            return type(self)(samples=self.samples[idx])
+        else:
+            raise TypeError("Index must be int or slice")
+
+
+@dataclass
+class EvaluationResult:
+    """
+    A class to store and process the results of the evaluation.
+
+    Attributes
+    ----------
+    scores : list of dict
+        The evaluation scores, one dict of metric name to value per sample.
+    dataset : EvaluationDataset, optional
+        The original dataset used for the evaluation. Default is None.
+    binary_columns : list of str, optional
+        List of columns that are binary metrics. Default is an empty list.
+    cost_cb : CostCallbackHandler, optional
+        The callback handler for cost computation. Default is None.
+    """
+
+    scores: t.List[t.Dict[str, t.Any]]
+    dataset: t.Optional[EvaluationDataset] = None
+    binary_columns: t.List[str] = field(default_factory=list)
+    cost_cb: t.Optional[CostCallbackHandler] = None
+
+    def __post_init__(self):
+        # transform scores from list of dicts to dict of lists
+        self._scores_dict = {
+            k: [d[k] for d in self.scores] for k in self.scores[0].keys()
+        }
+
+        values = []
+        self._repr_dict = {}
+        for metric_name in self._scores_dict.keys():
+            value = safe_nanmean(self._scores_dict[metric_name])
+            self._repr_dict[metric_name] = value
+            if metric_name not in self.binary_columns:
+                value = t.cast(float, value)
+                values.append(value + 1e-10)
+
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        """
+        Convert the result to a pandas DataFrame.
+
+        Parameters
+        ----------
+        batch_size : int, optional
+            The batch size for conversion. Default is None.
+        batched : bool, optional
+            Whether to convert in batches. Default is False.
+
+        Returns
+        -------
+        pandas.DataFrame
+            The result as a pandas DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is not provided.
+        """
+        try:
+            import pandas as pd
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed. Please install it to use this function."
+            )
+
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert len(self.scores) == len(self.dataset)
+        # convert both to pandas dataframes and concatenate
+        scores_df = pd.DataFrame(self.scores)
+        dataset_df = self.dataset.to_pandas()
+        return pd.concat([dataset_df, scores_df], axis=1)
+
+    def total_tokens(self) -> t.Union[t.List[TokenUsage], TokenUsage]:
+        """
+        Compute the total tokens used in the evaluation.
+
+        Returns
+        -------
+        list of TokenUsage or TokenUsage
+            The total tokens used.
+
+        Raises
+        ------
+        ValueError
+            If the cost callback handler is not provided.
+        """
+        if self.cost_cb is None:
+            raise ValueError(
+                "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
+            )
+        return self.cost_cb.total_tokens()
+
+    def total_cost(
+        self,
+        cost_per_input_token: t.Optional[float] = None,
+        cost_per_output_token: t.Optional[float] = None,
+        per_model_costs: t.Dict[str, t.Tuple[float, float]] = {},
+    ) -> float:
+        """
+        Compute the total cost of the evaluation.
+
+        Parameters
+        ----------
+        cost_per_input_token : float, optional
+            The cost per input token. Default is None.
+        cost_per_output_token : float, optional
+            The cost per output token. Default is None.
+        per_model_costs : dict of str to tuple of float, optional
+            The per model costs. Default is an empty dictionary.
+
+        Returns
+        -------
+        float
+            The total cost of the evaluation.
+
+        Raises
+        ------
+        ValueError
+            If the cost callback handler is not provided.
+        """
+        if self.cost_cb is None:
+            raise ValueError(
+                "The evaluate() run was not configured for computing cost. Please provide a token_usage_parser function to evaluate() to compute cost."
+            )
+        return self.cost_cb.total_cost(
+            cost_per_input_token, cost_per_output_token, per_model_costs
+        )
+
+    def __repr__(self) -> str:
+        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
+        return "{" + ", ".join(score_strs) + "}"
+
+    def __getitem__(self, key: str) -> t.List[float]:
+        return self._scores_dict[key]
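
For orientation, here is a minimal usage sketch of the EvaluationResult API added in this diff. The metric names and score values are made up, and the import path assumes this file is ragas/dataset_schema.py; treat it as an illustration, not part of the commit.

import typing as t

# Assumed import path: this diff appears to edit ragas/dataset_schema.py.
from ragas.dataset_schema import EvaluationResult

# Hypothetical per-sample scores in the shape EvaluationResult expects:
# one dict of metric name -> value per evaluated sample.
scores: t.List[t.Dict[str, t.Any]] = [
    {"faithfulness": 0.90, "answer_relevancy": 0.80},
    {"faithfulness": 1.00, "answer_relevancy": 0.70},
]

result = EvaluationResult(scores=scores)

print(result)                  # {'faithfulness': 0.9500, 'answer_relevancy': 0.7500}
print(result["faithfulness"])  # [0.9, 1.0]

# to_pandas() needs the original EvaluationDataset (dataset=...), and
# total_tokens()/total_cost() need a cost callback handler (cost_cb=...);
# without them those methods raise ValueError, as defined above.
# EvaluationDataset itself now supports slicing: dataset[:10] returns a
# new EvaluationDataset, while dataset[0] returns a single sample.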