
Commit 1e07768

feat: make Result more useful (#39)
- added a few tests too
1 parent: 48ae599

File tree

- Makefile
- src/ragas/evaluation.py
- src/ragas/metrics/base.py
- src/ragas/metrics/factual.py
- tests/unit/test_metric.py

5 files changed: +92 -20 lines

Makefile

Lines changed: 3 additions & 0 deletions
@@ -28,3 +28,6 @@ run-ci: format lint type ## Running all CI checks
 run-benchmarks: ## Run benchmarks
 	@echo "Running benchmarks..."
 	@cd $(GIT_ROOT)/tests/benchmarks && python benchmark.py
+test: ## Run tests
+	@echo "Running tests..."
+	@pytest tests/unit
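
In practice this means `make test` from the repository root now runs the unit suite (pytest against tests/unit), alongside the existing run-benchmarks target.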

src/ragas/evaluation.py

Lines changed: 60 additions & 4 deletions
@@ -26,9 +26,46 @@ def get_evaluation_mode(ds: Dataset):
 
 def evaluate(
     dataset: Dataset,
-    metrics: list[Metric],
+    metrics: list[Metric] | None = None,
 ) -> Result:
-    """ """
+    """
+    Run the evaluation on the dataset with different metrics
+
+    Parameters
+    ----------
+    dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
+        The dataset in the format of ragas which the metrics will use to score the RAG
+        pipeline with
+
+    metrics : list[Metric], optional
+        List of metrics to use for evaluation. If not provided then ragas will run the
+        evaluation on the best set of metrics to give a complete view.
+
+    Returns
+    -------
+    result : Result
+        Result object containing the scores of each metric. You can use this to do
+        analysis later. If the top 3 metrics are provided then it also returns the
+        `ragas_score` for the entire pipeline.
+
+    Examples
+    --------
+    the basic usage is as follows:
+    ```
+    from ragas import evaluate
+
+    >>> dataset
+    Dataset({
+        features: ['question', 'ground_truths', 'answer', 'contexts'],
+        num_rows: 30
+    })
+
+    >>> result = evaluate(dataset)
+    >>> print(result["ragas_score"])
+    {'ragas_score': 0.860, 'context_relavency': 0.817, 'factuality': 0.892,
+    'answer_relevancy': 0.874}
+    ```
+    """
     if dataset is None:
         raise ValueError("Provide dataset!")
 
@@ -37,6 +74,11 @@ def evaluate(
 
     # TODO: check if all the metrics are compatible with the evaluation mode
 
+    if metrics is None:
+        from ragas.metrics import answer_relevancy, context_relevancy, factuality
+
+        metrics = [answer_relevancy, context_relevancy, factuality]
+
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics
     [m.init_model() for m in metrics]
@@ -45,12 +87,14 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    return Result(concatenate_datasets(scores, axis=1))
+    return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)
 
 
 @dataclass
 class Result(dict):
     scores: Dataset
+    dataset: Dataset | None = None
+    ragas_score: float | None = None
 
     def __post_init__(self):
         values = []
@@ -77,5 +121,17 @@ def describe(self):
         }
         return description
 
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert self.scores.shape[0] == self.dataset.shape[0]
+        result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)
+
+        return result_ds.to_pandas(batch_size=batch_size, batched=batched)
+
     def __repr__(self) -> str:
-        return super().__repr__()
+        scores = self.copy()
+        ragas_score = scores.pop("ragas_score")
+        score_strs = [f"'ragas_score': {ragas_score:0.3f}"]
+        score_strs.extend([f"'{k}': {v:0.3f}" for k, v in scores.items()])
+        return "{" + ", ".join(score_strs) + "}"

src/ragas/metrics/base.py

Lines changed: 17 additions & 15 deletions
@@ -14,6 +14,21 @@
 from datasets import Dataset
 
 
+def make_batches(total_size: int, batch_size: int) -> list[range]:
+    """
+    Take a total size and batch size and return a list of ranges for the batches
+    """
+    tail = total_size % batch_size
+    num_batches = floor(total_size / batch_size)
+    batches = [
+        range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size)
+    ]
+    if tail != 0:
+        batches.append(range(batch_size * num_batches, batch_size * num_batches + tail))
+
+    return batches
+
+
 @dataclass
 class Metric(ABC):
     @property
@@ -40,18 +55,5 @@ def init_model():
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         ...
 
-    def get_batches(self, dataset_size: int):
-        tail = dataset_size % self.batch_size
-        num_batches = floor(dataset_size / self.batch_size)
-        batches = [
-            range(i, i + self.batch_size)
-            for i in range(0, self.batch_size * num_batches, self.batch_size)
-        ]
-        if tail != 0:
-            batches.append(
-                range(
-                    self.batch_size * num_batches, self.batch_size * num_batches + tail
-                )
-            )
-
-        return batches
+    def get_batches(self, dataset_size: int) -> list[range]:
+        return make_batches(dataset_size, self.batch_size)
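
Because the batching logic is now a free function rather than a Metric method, it can be exercised without instantiating a metric; a quick illustration (same import path the new unit test uses):

```
from ragas.metrics.base import make_batches

print(make_batches(total_size=10, batch_size=5))
# [range(0, 5), range(5, 10)]

print(make_batches(total_size=11, batch_size=5))
# [range(0, 5), range(5, 10), range(10, 11)] -- the tail range carries the remainder
```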

src/ragas/metrics/factual.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ class Factuality(Metric):
 
     @property
     def name(self):
-        return "NLI_score"
+        return "factuality"
 
     def init_model(self: t.Self):
         pass
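
Since evaluate() selects score columns by metric.name, this rename also changes the reported key: results now expose 'factuality' instead of 'NLI_score', matching the docstring example and the default metric set added above.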

tests/unit/test_metric.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+import pytest
+
+from ragas.metrics.base import make_batches
+
+
+@pytest.mark.parametrize(
+    "batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)]
+)
+def test_make_batches(batch_size, total_size, len_expected):
+    batches = make_batches(total_size, batch_size)
+    assert len(batches) == len_expected
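
The expected lengths follow ceiling division on batch_size=5: 10 rows split evenly into 2 ranges, 11 rows give 2 full ranges plus a tail of 1 (3 total), and 9 rows give 1 full range plus a tail of 4 (2 total).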
