
Commit 8d658a1

feat: Ragas CI/CD (#976)
new feature based on the Ragas reproducibility docs: https://ragas--976.org.readthedocs.build/en/976/howtos/applications/add_to_ci.html#
1 parent a13ab02 commit 8d658a1

File tree

10 files changed: +204 additions, -13 deletions


docs/getstarted/index.md

Lines changed: 1 addition & 1 deletion
@@ -41,4 +41,4 @@ Find out how to evaluate your RAG pipeline using your test set (your own dataset
 :link-type: ref

 Discover how to monitor the performance and quality of your RAG application in production.
-:::
+:::

docs/howtos/applications/add_to_ci.md

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# Adding to your CI pipeline with Pytest

You can add Ragas evaluations as part of your Continuous Integration pipeline
to keep track of the qualitative performance of your RAG pipeline. Consider these
as part of your end-to-end test suite, which you run before major changes and releases.

Usage is straightforward: the main thing is to set the `in_ci` argument of the
`evaluate()` function to `True`. This runs the Ragas metrics in a special mode that
produces more reproducible scores, at the cost of longer and more expensive runs.

You can write a pytest test as follows:

:::{note}
This dataset is already populated with outputs from a reference RAG pipeline.
When testing your own system, make sure you use outputs from the RAG pipeline
you want to test. For more information on how to build your own dataset, check the
[Building HF `Dataset` with your own Data](./data_preparation.md) docs.
:::

```{code-block} python
:caption: tests/e2e/test_amnesty_e2e.py
:linenos:
import pytest
from datasets import load_dataset

from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

def assert_in_range(score: float, value: float, plus_or_minus: float):
    """
    Check if the computed score is within the range of value +/- plus_or_minus
    """
    assert value - plus_or_minus <= score <= value + plus_or_minus


def test_amnesty_e2e():
    # loading the V2 dataset
    amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")["eval"]

    result = evaluate(
        amnesty_qa,
        metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
        in_ci=True,
    )
    assert result["answer_relevancy"] >= 0.9
    assert result["context_recall"] >= 0.95
    assert result["context_precision"] >= 0.95
    assert_in_range(result["faithfulness"], value=0.4, plus_or_minus=0.1)
```

## Using Pytest Markers for Ragas E2E tests

Because these are long-running end-to-end tests, you can leverage [Pytest Markers](https://docs.pytest.org/en/latest/example/markers.html) to tag them. It is recommended to mark Ragas tests with a dedicated tag so that you run them only when needed.

To add a new `ragas_ci` marker to pytest, add the following to your `conftest.py`:
```{code-block} python
:caption: conftest.py
def pytest_configure(config):
    """
    configure pytest
    """
    # add `ragas_ci`
    config.addinivalue_line(
        "markers", "ragas_ci: Set of tests that will be run as part of Ragas CI"
    )
```

Now you can use `ragas_ci` to mark all the tests that are part of Ragas CI.

```{code-block} python
:caption: tests/e2e/test_amnesty_e2e.py
:linenos:
:emphasize-added: 19
import pytest
from datasets import load_dataset

from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

def assert_in_range(score: float, value: float, plus_or_minus: float):
    """
    Check if the computed score is within the range of value +/- plus_or_minus
    """
    assert value - plus_or_minus <= score <= value + plus_or_minus


@pytest.mark.ragas_ci
def test_amnesty_e2e():
    # loading the V2 dataset
    amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")["eval"]

    result = evaluate(
        amnesty_qa,
        metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
        in_ci=True,
    )
    assert result["answer_relevancy"] >= 0.9
    assert result["context_recall"] >= 0.95
    assert result["context_precision"] >= 0.95
    assert_in_range(result["faithfulness"], value=0.4, plus_or_minus=0.1)
```
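
With the marker in place, you can select just these tests from the command line with `pytest -m ragas_ci` (or exclude them with `pytest -m "not ragas_ci"`). As a minimal sketch, the same selection can also be triggered from Python via `pytest.main`; the script in the caption is a hypothetical helper and not part of this commit.

```{code-block} python
:caption: run_ragas_ci.py (hypothetical helper, not part of this commit)
import pytest

# programmatic equivalent of `pytest -m ragas_ci`: run only the tests
# carrying the `ragas_ci` marker registered in conftest.py
raise SystemExit(pytest.main(["-m", "ragas_ci"]))
```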

docs/howtos/applications/data_preparation.md

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Prepare data for evaluation
+# Building HF Dataset with your own Data

 This tutorial notebook provides a step-by-step guide on how to prepare data for experimenting and evaluating using ragas.

@@ -27,4 +27,4 @@ data_samples = {
     'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
 }
 dataset = Dataset.from_dict(data_samples)
-```
+```

docs/howtos/applications/index.md

Lines changed: 1 addition & 0 deletions
@@ -12,4 +12,5 @@ compare_llms
 custom_prompts
 use_prompt_adaptation
 tracing
+add_to_ci
 ```

src/ragas/_analytics.py

Lines changed: 1 addition & 0 deletions
@@ -88,6 +88,7 @@ class EvaluationEvent(BaseEvent):
     evaluation_mode: str
     num_rows: int
     language: str
+    in_ci: bool


 class TestsetGenerationEvent(BaseEvent):

src/ragas/evaluation.py

Lines changed: 27 additions & 4 deletions
@@ -20,7 +20,12 @@
 from ragas.llms import llm_factory
 from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper
 from ragas.metrics._answer_correctness import AnswerCorrectness
-from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM
+from ragas.metrics.base import (
+    Metric,
+    MetricWithEmbeddings,
+    MetricWithLLM,
+    is_reproducable,
+)
 from ragas.metrics.critique import AspectCritique
 from ragas.run_config import RunConfig
 from ragas.utils import get_feature_language
@@ -43,6 +48,7 @@ def evaluate(
     llm: t.Optional[BaseRagasLLM | LangchainLLM] = None,
     embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
     callbacks: Callbacks = None,
+    in_ci: bool = False,
     is_async: bool = True,
     run_config: t.Optional[RunConfig] = None,
     raise_exceptions: bool = True,
@@ -71,7 +77,11 @@
         Lifecycle Langchain Callbacks to run during evaluation. Check the
         [langchain documentation](https://python.langchain.com/docs/modules/callbacks/)
         for more information.
-    is_async: bool, optional
+    in_ci: bool
+        Whether the evaluation is running in CI or not. If set to True then some
+        metrics will be run to increase the reproducibility of the evaluations. This
+        will increase the runtime and cost of evaluations. Default is False.
+    is_async: bool
         Whether to run the evaluation in async mode or not. If set to True then the
         evaluation is run by calling the `metric.ascore` method. In case the llm or
         embeddings does not support async then the evaluation can be run in sync mode
@@ -156,9 +166,12 @@
     binary_metrics = []
     llm_changed: t.List[int] = []
     embeddings_changed: t.List[int] = []
+    reproducable_metrics: t.List[int] = []
     answer_correctness_is_set = -1

+    # loop through the metrics and perform initializations
     for i, metric in enumerate(metrics):
+        # set llm and embeddings if not set
         if isinstance(metric, AspectCritique):
             binary_metrics.append(metric.name)
         if isinstance(metric, MetricWithLLM) and metric.llm is None:
@@ -174,9 +187,15 @@
         if isinstance(metric, AnswerCorrectness):
             if metric.answer_similarity is None:
                 answer_correctness_is_set = i
+        # set reproducibility for metrics if in CI
+        if in_ci and is_reproducable(metric):
+            if metric.reproducibility == 1:  # type: ignore
+                # only set a value if not already set
+                metric.reproducibility = 3  # type: ignore
+                reproducable_metrics.append(i)

-    # initialize all the models in the metrics
-    [m.init(run_config) for m in metrics]
+        # init all the models
+        metric.init(run_config)

     executor = Executor(
         desc="Evaluating",
@@ -248,6 +267,9 @@
             AnswerCorrectness, metrics[answer_correctness_is_set]
         ).answer_similarity = None

+    for i in reproducable_metrics:
+        metrics[i].reproducibility = 1  # type: ignore
+
     # log the evaluation event
     metrics_names = [m.name for m in metrics]
     metric_lang = [get_feature_language(m) for m in metrics]
@@ -259,6 +281,7 @@
             evaluation_mode="",
             num_rows=dataset.shape[0],
             language=metric_lang[0] if len(metric_lang) > 0 else "",
+            in_ci=in_ci,
         )
     )
     return result

src/ragas/metrics/base.py

Lines changed: 9 additions & 6 deletions
@@ -61,13 +61,11 @@ def get_required_columns(
 class Metric(ABC):
     @property
     @abstractmethod
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...

     @property
     @abstractmethod
-    def evaluation_mode(self) -> EvaluationMode:
-        ...
+    def evaluation_mode(self) -> EvaluationMode: ...

     @abstractmethod
     def init(self, run_config: RunConfig):
@@ -129,8 +127,9 @@ async def ascore(
         return score

     @abstractmethod
-    async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float:
-        ...
+    async def _ascore(
+        self, row: t.Dict, callbacks: Callbacks, is_async: bool
+    ) -> float: ...


 @dataclass
@@ -219,4 +218,8 @@ def get_segmenter(
     )


+def is_reproducable(metric: Metric) -> bool:
+    return hasattr(metric, "_reproducibility")
+
+
 ensembler = Ensember()
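
The new `is_reproducable` helper is only an attribute check: any metric object that defines `_reproducibility` is treated as reproducibility-aware by `evaluate()`. A minimal sketch of that behaviour (the `_DemoMetric` class below is purely illustrative and not part of this commit):

```python
from ragas.metrics.base import is_reproducable

class _DemoMetric:
    # illustrative stand-in: defining `_reproducibility` is what the check looks for
    _reproducibility = 1

print(is_reproducable(_DemoMetric()))  # True: `_reproducibility` is defined
print(is_reproducable(object()))       # False: the attribute is missing
```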

tests/conftest.py

Lines changed: 10 additions & 0 deletions
@@ -11,6 +11,16 @@
 from ragas.llms.prompt import PromptValue


+def pytest_configure(config):
+    """
+    configure pytest
+    """
+    # add `ragas_ci` marker
+    config.addinivalue_line(
+        "markers", "ragas_ci: Set of tests that will be run as part of Ragas CI"
+    )
+
+
 class FakeTestLLM(BaseRagasLLM):
     def llm(self):
         return self

tests/e2e/test_amnesty_in_ci.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import pytest
+from datasets import load_dataset
+
+from ragas import evaluate
+from ragas.metrics import (
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+)
+
+# loading the V2 dataset
+amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")["eval"]
+
+
+def assert_in_range(score: float, value: float, plus_or_minus: float):
+    """
+    Check if the computed score is within the range of value +/- plus_or_minus
+    """
+    assert value - plus_or_minus <= score <= value + plus_or_minus
+
+
+@pytest.mark.ragas_ci
+def test_amnesty_e2e():
+    result = evaluate(
+        amnesty_qa,
+        metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
+        in_ci=True,
+    )
+    assert result["answer_relevancy"] >= 0.9
+    assert result["context_recall"] >= 0.95
+    assert result["context_precision"] >= 0.95
+    assert_in_range(result["faithfulness"], value=0.4, plus_or_minus=0.1)
+
+
+@pytest.mark.ragas_ci
+def test_assert_in_range():
+    assert_in_range(0.5, value=0.1, plus_or_minus=0.1)

tests/unit/test_analytics.py

Lines changed: 2 additions & 0 deletions
@@ -26,13 +26,15 @@ def test_evaluation_event():
         num_rows=1,
         evaluation_mode="",
         language="english",
+        in_ci=True,
     )

     payload = dict(evaluation_event)
     assert isinstance(payload.get("user_id"), str)
     assert isinstance(payload.get("evaluation_mode"), str)
     assert isinstance(payload.get("metrics"), list)
     assert isinstance(payload.get("language"), str)
+    assert isinstance(payload.get("in_ci"), bool)


 def setup_user_id_filepath(tmp_path, monkeypatch):
