@@ -26,9 +26,46 @@ def get_evaluation_mode(ds: Dataset):
 
 def evaluate(
     dataset: Dataset,
-    metrics: list[Metric],
+    metrics: list[Metric] | None = None,
 ) -> Result:
-    """ """
+    """
+    Run the evaluation on the dataset with different metrics.
+
+    Parameters
+    ----------
+    dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
+        The dataset, in the ragas format, that the metrics will use to score the
+        RAG pipeline.
+
+    metrics : list[Metric], optional
+        List of metrics to use for evaluation. If not provided, ragas runs the
+        evaluation with a default set of metrics that gives a complete view.
+
+    Returns
+    -------
+    result : Result
+        Result object containing the scores of each metric. You can use this to do
+        analysis later. If the top 3 metrics are provided, it also returns the
+        `ragas_score` for the entire pipeline.
+
+    Examples
+    --------
+    The basic usage is as follows:
+    ```
+    >>> from ragas import evaluate
+
+    >>> dataset
+    Dataset({
+        features: ['question', 'ground_truths', 'answer', 'contexts'],
+        num_rows: 30
+    })
+
+    >>> result = evaluate(dataset)
+    >>> print(result)
+    {'ragas_score': 0.860, 'context_relevancy': 0.817, 'factuality': 0.892,
+    'answer_relevancy': 0.874}
+    ```
+    """
     if dataset is None:
         raise ValueError("Provide dataset!")
 
@@ -37,6 +74,11 @@ def evaluate(
 
     # TODO: check if all the metrics are compatible with the evaluation mode
 
+    if metrics is None:
+        from ragas.metrics import answer_relevancy, context_relevancy, factuality
+
+        metrics = [answer_relevancy, context_relevancy, factuality]
+
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics
     [m.init_model() for m in metrics]
@@ -45,12 +87,14 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    return Result(concatenate_datasets(scores, axis=1))
+    return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)
 
 
 @dataclass
 class Result(dict):
     scores: Dataset
+    dataset: Dataset | None = None
+    ragas_score: float | None = None
 
     def __post_init__(self):
         values = []
@@ -77,5 +121,17 @@ def describe(self):
         }
         return description
 
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert self.scores.shape[0] == self.dataset.shape[0]
+        result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)
+
+        return result_ds.to_pandas(batch_size=batch_size, batched=batched)
+
     def __repr__(self) -> str:
-        return super().__repr__()
+        scores = self.copy()
+        ragas_score = scores.pop("ragas_score")
+        score_strs = [f"'ragas_score': {ragas_score:0.3f}"]
+        score_strs.extend([f"'{k}': {v:0.3f}" for k, v in scores.items()])
+        return "{" + ", " .join(score_strs) + "}"
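A minimal usage sketch of the API as it stands after this diff; the toy dataset below is an assumption built from the column names in the docstring, not part of the change itself:

```python
from datasets import Dataset

from ragas import evaluate

# Hypothetical toy dataset with the columns the docstring expects;
# a real run would use the outputs of your RAG pipeline.
ds = Dataset.from_dict(
    {
        "question": ["What is the capital of France?"],
        "contexts": [["Paris is the capital and largest city of France."]],
        "answer": ["Paris"],
        "ground_truths": [["Paris"]],
    }
)

# metrics defaults to None, so the trio imported inside evaluate() is used
# and the printed repr includes the combined ragas_score.
result = evaluate(ds)
print(result)

# Result now carries the source dataset, so row-level scores can be exported.
df = result.to_pandas()
print(df.head())
```

Passing `dataset=dataset` into `Result` is what makes `to_pandas()` possible here, since the scores alone do not carry the original rows.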