Skip to content

Commit 7946480

Browse files
authored
metrics: add faithfulness with HHEM model (#1191)
Ragas integration for https://vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model/
1 parent bd64df9 commit 7946480

File tree

5 files changed

+79
-1
lines changed

5 files changed

+79
-1
lines changed

docs/concepts/metrics/faithfulness.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,24 @@ Let's examine how faithfulness was calculated using the low faithfulness answer:
5858
```
5959
6060
61+
## Faithfulness with HHEM 2.1 Model
62+
63+
[Vectara's HHEM 2.1](https://vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model/) is a classifier model (T5) that is trained to detect hallucinations in LLM-generated text. This model can be used in the second step of calculating faithfulness, i.e. when claims are cross-checked against the given context to determine whether they can be inferred from it. The model is free, small, and open source, making it very efficient to use in production use cases. To use the model to calculate faithfulness, you can use the following code snippet:
64+
65+
```{code-block} python
66+
from datasets import Dataset
67+
from ragas.metrics import FaithulnesswithHHEM
68+
from ragas import evaluate
69+
70+
faithfulness_with_hhem = FaithulnesswithHHEM()
71+
data_samples = {
72+
'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
73+
'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
74+
'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
75+
['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
76+
}
77+
dataset = Dataset.from_dict(data_samples)
78+
score = evaluate(dataset,metrics=[faithfulness_with_hhem])
79+
score.to_pandas()
80+
81+
```

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dynamic = ["version", "readme"]
1818
[project.optional-dependencies]
1919
all = [
2020
"sentence-transformers",
21+
"transformers",
2122
]
2223

2324
[tool.setuptools]

requirements/dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ llama_index
77
notebook
88
sphinx-autobuild
99
sentence-transformers
10+
transformers
1011
fastembed
1112
graphene
1213
fuzzywuzzy

src/ragas/metrics/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
context_utilization,
1313
)
1414
from ragas.metrics._context_recall import ContextRecall, context_recall
15-
from ragas.metrics._faithfulness import Faithfulness, faithfulness
15+
from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
1616
from ragas.metrics._summarization import SummarizationScore, summarization_score
1717
from ragas.metrics.critique import AspectCritique
1818

@@ -21,6 +21,7 @@
2121
"answer_correctness",
2222
"Faithfulness",
2323
"faithfulness",
24+
"FaithulnesswithHHEM",
2425
"AnswerSimilarity",
2526
"answer_similarity",
2627
"ContextPrecision",

src/ragas/metrics/_faithfulness.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,4 +313,58 @@ def save(self, cache_dir: t.Optional[str] = None) -> None:
313313
self.statement_prompt.save(cache_dir)
314314

315315

316+
@dataclass
class FaithulnesswithHHEM(Faithfulness):
    """Faithfulness metric whose claim-verification step is done by Vectara's
    HHEM-2.1 hallucination-detection classifier instead of the LLM.

    Statements are still decomposed from the answer by the LLM (as in
    ``Faithfulness``); each statement is then scored against the joined
    contexts by the local ``vectara/hallucination_evaluation_model``.
    Requires the ``transformers`` package.
    """

    # NOTE: class/metric name kept as-is (including the historical spelling)
    # because it is exported from ragas.metrics.__init__ and used by callers.
    name: str = "faithfulness_with_hhem"  # type: ignore

    def __post_init__(self):
        # Import lazily so ragas does not hard-depend on transformers.
        try:
            from transformers import AutoModelForSequenceClassification
        except ImportError as err:
            # Chain the original error so the user sees the real import failure.
            raise ImportError(
                "Huggingface transformers must be installed to use this feature, try `pip install transformers`"
            ) from err
        # Downloads/loads the HHEM classifier; trust_remote_code is required
        # because the model ships its own predict() implementation.
        self.nli_classifier = AutoModelForSequenceClassification.from_pretrained(
            "vectara/hallucination_evaluation_model", trust_remote_code=True
        )
        super().__post_init__()

    def _create_pairs(
        self, row: t.Dict, statements: t.List[str]
    ) -> t.List[t.Tuple[str, str]]:
        """
        Create (premise, hypothesis) pairs for the classifier: the row's
        contexts joined into one premise, paired with each statement.
        """
        premise = "\n".join(row["contexts"])
        pairs = [(premise, statement) for statement in statements]
        return pairs

    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
        """
        Return the mean HHEM entailment score over all statements extracted
        from the answer; ``np.nan`` when statement extraction fails or yields
        no statements.
        """
        assert self.llm is not None, "LLM is not set"

        # Step 1: decompose the answer into simple statements via the LLM.
        p_value = self._create_statements_prompt(row)
        statements = await self.llm.generate(
            p_value,
            callbacks=callbacks,
        )
        statements = await _statements_output_parser.aparse(
            statements.generations[0][0].text, p_value, self.llm, self.max_retries
        )

        if statements is None:
            return np.nan

        # Flatten the per-sentence "simpler_statements" lists into one list.
        statements = [item["simpler_statements"] for item in statements.dicts()]
        statements = [item for sublist in statements for item in sublist]

        assert isinstance(statements, t.List), "statements must be a list"

        # Guard: no statements means nothing to verify — avoid predict([])
        # and a division by zero below.
        if not statements:
            return np.nan

        # Step 2: classify each (context, statement) pair with HHEM and
        # average the rounded (0/1) entailment decisions.
        pairs = self._create_pairs(row, statements)
        scores = self.nli_classifier.predict(pairs).detach().numpy().round()
        return scores.sum() / len(scores)
368+
369+
316370
faithfulness = Faithfulness()

0 commit comments

Comments
 (0)