
Commit 6d114e5

This commit implements the F-beta score metric (#1543) for the AnswerCorrectness class. The beta parameter is introduced to control the relative importance of recall and precision when calculating the score. Specifically:

- beta > 1 places more emphasis on recall.
- beta < 1 favors precision.
- beta == 1 gives the regular F1 score, which can be interpreted as the harmonic mean of precision and recall.

Key changes: the method _compute_statement_presence is updated to calculate the F-beta score from true positives (TP), false positives (FP), and false negatives (FN). This lets the balance between recall and precision be tuned to the task's requirements via the beta value.

Source: https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.fbeta_score.html

Co-authored-by: Shahules786 <[email protected]>
1 parent fd5e805 commit 6d114e5
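For reference, the score introduced here follows the standard F-beta definition implemented by the new fbeta_score helper below (and by the scikit-learn function linked above):

    F_beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)

with precision = TP / (TP + FP) and recall = TP / (TP + FN). In the limit of large beta the expression reduces to recall, and for small beta it reduces to precision, which is why beta > 1 emphasizes recall and beta < 1 emphasizes precision.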

File tree: 5 files changed, +63 -53 lines changed


src/ragas/metrics/__init__.py

Lines changed: 0 additions & 10 deletions
@@ -1,6 +1,3 @@
-import inspect
-import sys
-
 from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
 from ragas.metrics._answer_relevance import (
     AnswerRelevancy,
@@ -120,10 +117,3 @@
     "MultiModalRelevance",
     "multimodal_relevance",
 ]
-
-current_module = sys.modules[__name__]
-ALL_METRICS = [
-    obj
-    for name, obj in inspect.getmembers(current_module)
-    if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
-]

src/ragas/metrics/_answer_correctness.py

Lines changed: 8 additions & 1 deletion
@@ -21,6 +21,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 
@@ -167,6 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
+    beta: float = 1.0
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -185,6 +187,11 @@ def __post_init__(self: t.Self):
         language = self.long_form_answer_prompt.language
         self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
+        if type(self.beta) is not float:
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
+
     def init(self, run_config: RunConfig):
         super().init(run_config)
         if self.answer_similarity is None and self.weights[1] != 0:
@@ -198,7 +205,7 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
+        score = fbeta_score(tp, fp, fn, self.beta)
         return score
 
     async def _create_simplified_statements(

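As a quick illustration of the new knob, here is a hypothetical usage sketch (not part of this commit); configuring the evaluator LLM and embeddings follows the usual ragas setup and is omitted:

from ragas.metrics import AnswerCorrectness

# The __post_init__ check added above requires beta to be a float,
# so pass 2.0 / 0.5 rather than bare ints.
recall_focused = AnswerCorrectness(beta=2.0)     # beta > 1: weight recall
precision_focused = AnswerCorrectness(beta=0.5)  # beta < 1: weight precision
f1_default = AnswerCorrectness()                 # beta = 1.0: plain F1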
src/ragas/metrics/_factual_correctness.py

Lines changed: 35 additions & 9 deletions
@@ -16,6 +16,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -181,11 +182,32 @@ class ClaimDecompositionPrompt(
 
 @dataclass
 class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
+    """
+    FactualCorrectness is a metric class that evaluates the factual correctness of responses
+    generated by a language model. It uses claim decomposition and natural language inference (NLI)
+    to verify the claims made in the responses against reference texts.
+
+    Attributes:
+        name (str): The name of the metric, default is "factual_correctness".
+        _required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
+            for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
+        mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
+            "recall", or "f1". Default is "f1".
+        beta (float): The beta value used for the F1 score calculation. A beta > 1 gives more weight
+            to recall, while beta < 1 favors precision. Default is 1.0.
+        atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
+        coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
+        claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
+        nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
+
+    """
+
     name: str = "factual_correctness"  # type: ignore
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
+    beta: float = 1.0
     atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -204,6 +226,11 @@ def __post_init__(self):
         )
         self.segmenter = get_segmenter(language="english")
 
+        if type(self.beta) is not float:
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
+
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
     ) -> t.List[str]:
@@ -253,21 +280,20 @@ async def _single_turn_ascore(
         else:
             response_reference = np.array([])
 
-        true_positives = sum(reference_response)
-        false_positives = sum(~reference_response)
+        tp = sum(reference_response)
+        fp = sum(~reference_response)
         if self.mode != "precision":
-            false_negatives = sum(~response_reference)
+            fn = sum(~response_reference)
         else:
-            false_negatives = 0
+            fn = 0
+
 
         if self.mode == "precision":
-            score = true_positives / (true_positives + false_positives + 1e-8)
+            score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
-            score = true_positives / (true_positives + false_negatives + 1e-8)
+            score = tp / (tp + fp + 1e-8)
         else:
-            precision = true_positives / (true_positives + false_positives + 1e-8)
-            recall = true_positives / (true_positives + false_negatives + 1e-8)
-            score = 2 * (precision * recall) / (precision + recall + 1e-8)
+            score = fbeta_score(tp, fp, fn, self.beta)
 
         return np.round(score, 2)

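Similarly, a hypothetical sketch of how beta composes with mode for this metric (not part of the diff): only the "f1" branch consults beta, while "precision" and "recall" ignore it.

from ragas.metrics._factual_correctness import FactualCorrectness

# Recall-leaning factual correctness; beta must be a float per __post_init__.
scorer = FactualCorrectness(mode="f1", beta=2.0, atomicity="low", coverage="low")

# Actually scoring a SingleTurnSample additionally needs an evaluator LLM, e.g.
#   scorer.llm = evaluator_llm                # assumed to be configured elsewhere
#   await scorer.single_turn_ascore(sample)   # SingleTurnMetric entry point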
src/ragas/metrics/utils.py

Lines changed: 19 additions & 18 deletions
@@ -1,21 +1,22 @@
-from ragas.dataset_schema import EvaluationDataset
-from ragas.metrics import ALL_METRICS
-from ragas.metrics.base import Metric
-from ragas.validation import validate_required_columns
+def fbeta_score(tp, fp, fn, beta=1.0):
+    if tp + fp == 0:
+        precision = 0
+    else:
+        precision = tp / (tp + fp)
 
+    if tp + fn == 0:
+        recall = 0
+    else:
+        recall = tp / (tp + fn)
 
-def get_available_metrics(ds: EvaluationDataset) -> list[Metric]:
-    """
-    Get the available metrics for the given dataset.
-    E.g. if the dataset contains ("question", "answer", "contexts") columns,
-    the available metrics are those that can be evaluated in [qa, qac, qc] mode.
-    """
-    available_metrics = []
-    for metric in ALL_METRICS:
-        try:
-            validate_required_columns(ds, [metric])
-            available_metrics.append(metric)
-        except ValueError:
-            pass
+    if precision == 0 and recall == 0:
+        return 0.0
 
-    return available_metrics
+    beta_squared = beta**2
+    fbeta = (
+        (1 + beta_squared)
+        * (precision * recall)
+        / ((beta_squared * precision) + recall)
+    )
+
+    return fbeta

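A quick sanity check of the new helper with made-up counts (illustrative numbers only): precision here is 5/7 ≈ 0.71 and recall is 5/8 ≈ 0.63, so lowering beta (favoring precision) raises the score and raising beta (favoring recall) lowers it.

from ragas.metrics.utils import fbeta_score

tp, fp, fn = 5, 2, 3  # hypothetical statement counts

print(fbeta_score(tp, fp, fn, beta=0.5))  # ~0.694, leans on precision
print(fbeta_score(tp, fp, fn, beta=1.0))  # ~0.667, plain F1
print(fbeta_score(tp, fp, fn, beta=2.0))  # ~0.641, leans on recall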
tests/unit/test_metric.py

Lines changed: 1 addition & 15 deletions
@@ -1,22 +1,8 @@
 import typing as t
 from dataclasses import dataclass, field
 
-from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
+from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import MetricType
-from ragas.metrics.utils import get_available_metrics
-
-
-def test_get_available_metrics():
-    sample1 = SingleTurnSample(user_input="What is X", response="Y")
-    sample2 = SingleTurnSample(user_input="What is Z", response="W")
-    ds = EvaluationDataset(samples=[sample1, sample2])
-
-    assert all(
-        [
-            m.required_columns["SINGLE_TURN"] == {"response", "user_input"}
-            for m in get_available_metrics(ds)
-        ]
-    ), "All metrics should have required columns ('user_input', 'response')"
 
 
 def test_single_turn_metric():

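The removed test covered the deleted get_available_metrics helper; the commit itself does not add a unit test for fbeta_score. A minimal pytest sketch of what one could look like (hypothetical, not part of this commit):

import pytest

from ragas.metrics.utils import fbeta_score


@pytest.mark.parametrize(
    "tp, fp, fn, beta, expected",
    [
        (5, 2, 3, 1.0, 2 / 3),     # F1: harmonic mean of precision and recall
        (5, 2, 3, 2.0, 25 / 39),   # beta > 1 leans toward recall
        (5, 2, 3, 0.5, 6.25 / 9),  # beta < 1 leans toward precision
        (0, 4, 4, 1.0, 0.0),       # no true positives -> score is 0
    ],
)
def test_fbeta_score(tp, fp, fn, beta, expected):
    assert fbeta_score(tp, fp, fn, beta) == pytest.approx(expected)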