
Commit 1e07768

feat: make Result more useful (#39)
- added a few tests too
1 parent: 48ae599

File tree

- Makefile
- src/ragas/evaluation.py
- src/ragas/metrics/base.py
- src/ragas/metrics/factual.py
- tests/unit/test_metric.py

5 files changed: +92 -20 lines

Makefile

Lines changed: 3 additions & 0 deletions
@@ -28,3 +28,6 @@ run-ci: format lint type ## Running all CI checks
 run-benchmarks: ## Run benchmarks
 	@echo "Running benchmarks..."
 	@cd $(GIT_ROOT)/tests/benchmarks && python benchmark.py
+test: ## Run tests
+	@echo "Running tests..."
+	@pytest tests/unit
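
In practice this means `make test` from the repository root now runs the unit suite (pytest against tests/unit), alongside the existing run-benchmarks target.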

src/ragas/evaluation.py

Lines changed: 60 additions & 4 deletions
@@ -26,9 +26,46 @@ def get_evaluation_mode(ds: Dataset):
 
 def evaluate(
     dataset: Dataset,
-    metrics: list[Metric],
+    metrics: list[Metric] | None = None,
 ) -> Result:
-    """ """
+    """
+    Run the evaluation on the dataset with different metrics
+
+    Parameters
+    ----------
+    dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
+        The dataset in the format of ragas which the metrics will use to score the RAG
+        pipeline with
+
+    metrics : list[Metric], optional
+        List of metrics to use for evaluation. If not provided then ragas will run the
+        evaluation on the best set of metrics to give a complete view.
+
+    Returns
+    -------
+    result : Result
+        Result object containing the scores of each metric. You can use this to do
+        analysis later. If the top 3 metrics are provided then it also returns the
+        `ragas_score` for the entire pipeline.
+
+    Examples
+    --------
+    the basic usage is as follows:
+    ```
+    from ragas import evaluate
+
+    >>> dataset
+    Dataset({
+        features: ['question', 'ground_truths', 'answer', 'contexts'],
+        num_rows: 30
+    })
+
+    >>> result = evaluate(dataset)
+    >>> print(result["ragas_score"])
+    {'ragas_score': 0.860, 'context_relavency': 0.817, 'factuality': 0.892,
+    'answer_relevancy': 0.874}
+    ```
+    """
     if dataset is None:
         raise ValueError("Provide dataset!")
 
@@ -37,6 +74,11 @@ def evaluate(
 
     # TODO: check if all the metrics are compatible with the evaluation mode
 
+    if metrics is None:
+        from ragas.metrics import answer_relevancy, context_relevancy, factuality
+
+        metrics = [answer_relevancy, context_relevancy, factuality]
+
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics
     [m.init_model() for m in metrics]
@@ -45,12 +87,14 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    return Result(concatenate_datasets(scores, axis=1))
+    return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)
 
 
 @dataclass
 class Result(dict):
     scores: Dataset
+    dataset: Dataset | None = None
+    ragas_score: float | None = None
 
     def __post_init__(self):
         values = []
@@ -77,5 +121,17 @@ def describe(self):
         }
         return description
 
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert self.scores.shape[0] == self.dataset.shape[0]
+        result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)
+
+        return result_ds.to_pandas(batch_size=batch_size, batched=batched)
+
     def __repr__(self) -> str:
-        return super().__repr__()
+        scores = self.copy()
+        ragas_score = scores.pop("ragas_score")
+        score_strs = [f"'ragas_score': {ragas_score:0.3f}"]
+        score_strs.extend([f"'{k}': {v:0.3f}" for k, v in scores.items()])
+        return "{" + ", ".join(score_strs) + "}"

src/ragas/metrics/base.py

Lines changed: 17 additions & 15 deletions
@@ -14,6 +14,21 @@
 from datasets import Dataset
 
 
+def make_batches(total_size: int, batch_size: int) -> list[range]:
+    """
+    Take a total size and batch size and return a list of ranges for the batches
+    """
+    tail = total_size % batch_size
+    num_batches = floor(total_size / batch_size)
+    batches = [
+        range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size)
+    ]
+    if tail != 0:
+        batches.append(range(batch_size * num_batches, batch_size * num_batches + tail))
+
+    return batches
+
+
 @dataclass
 class Metric(ABC):
     @property
@@ -40,18 +55,5 @@ def init_model():
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         ...
 
-    def get_batches(self, dataset_size: int):
-        tail = dataset_size % self.batch_size
-        num_batches = floor(dataset_size / self.batch_size)
-        batches = [
-            range(i, i + self.batch_size)
-            for i in range(0, self.batch_size * num_batches, self.batch_size)
-        ]
-        if tail != 0:
-            batches.append(
-                range(
-                    self.batch_size * num_batches, self.batch_size * num_batches + tail
-                )
-            )
-
-        return batches
+    def get_batches(self, dataset_size: int) -> list[range]:
+        return make_batches(dataset_size, self.batch_size)
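
Because the batching logic is now a free function rather than a Metric method, it can be exercised without instantiating a metric; a quick illustration (same import path the new unit test uses):

```
from ragas.metrics.base import make_batches

print(make_batches(total_size=10, batch_size=5))
# [range(0, 5), range(5, 10)]

print(make_batches(total_size=11, batch_size=5))
# [range(0, 5), range(5, 10), range(10, 11)] -- the tail range carries the remainder
```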

src/ragas/metrics/factual.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ class Factuality(Metric):
 
     @property
     def name(self):
-        return "NLI_score"
+        return "factuality"
 
     def init_model(self: t.Self):
         pass
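
Since evaluate() selects score columns by metric.name, this rename also changes the reported key: results now expose 'factuality' instead of 'NLI_score', matching the docstring example and the default metric set added above.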

tests/unit/test_metric.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+import pytest
+
+from ragas.metrics.base import make_batches
+
+
+@pytest.mark.parametrize(
+    "batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)]
+)
+def test_make_batches(batch_size, total_size, len_expected):
+    batches = make_batches(total_size, batch_size)
+    assert len(batches) == len_expected
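
The expected lengths follow ceiling division on batch_size=5: 10 rows split evenly into 2 ranges, 11 rows give 2 full ranges plus a tail of 1 (3 total), and 9 rows give 1 full range plus a tail of 4 (2 total).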
