Skip to content

Commit 7946480

Browse files
authored
metrics: add faithfulness with HHEM model (#1191)
Ragas integration for https://vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model/
1 parent bd64df9 commit 7946480

File tree

5 files changed

+79
-1
lines changed

5 files changed

+79
-1
lines changed

docs/concepts/metrics/faithfulness.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,24 @@ Let's examine how faithfulness was calculated using the low faithfulness answer:
5858
```
5959
6060
61+
## Faithfulness with HHEM 2.1 Model
62+
63+
[Vectara's HHEM 2.1](https://vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model/) is a classifier model (T5) that is trained to detect hallucinations in LLM-generated text. This model can be used in the second step of calculating faithfulness, i.e. when claims are cross-checked against the given context to determine whether they can be inferred from it. The model is free, small, and open source, making it very efficient to use in production use cases. To use the model to calculate faithfulness, you can use the following code snippet:
64+
65+
```{code-block} python
66+
from datasets import Dataset
67+
from ragas.metrics import FaithulnesswithHHEM
68+
from ragas import evaluate
69+
70+
faithfulness_with_hhem = FaithulnesswithHHEM()
71+
data_samples = {
72+
'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
73+
'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
74+
'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
75+
['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
76+
}
77+
dataset = Dataset.from_dict(data_samples)
78+
score = evaluate(dataset,metrics=[faithfulness_with_hhem])
79+
score.to_pandas()
80+
81+
```

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dynamic = ["version", "readme"]
1818
[project.optional-dependencies]
1919
all = [
2020
"sentence-transformers",
21+
"transformers",
2122
]
2223

2324
[tool.setuptools]

requirements/dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ llama_index
77
notebook
88
sphinx-autobuild
99
sentence-transformers
10+
transformers
1011
fastembed
1112
graphene
1213
fuzzywuzzy

src/ragas/metrics/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
context_utilization,
1313
)
1414
from ragas.metrics._context_recall import ContextRecall, context_recall
15-
from ragas.metrics._faithfulness import Faithfulness, faithfulness
15+
from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness
1616
from ragas.metrics._summarization import SummarizationScore, summarization_score
1717
from ragas.metrics.critique import AspectCritique
1818

@@ -21,6 +21,7 @@
2121
"answer_correctness",
2222
"Faithfulness",
2323
"faithfulness",
24+
"FaithulnesswithHHEM",
2425
"AnswerSimilarity",
2526
"answer_similarity",
2627
"ContextPrecision",

src/ragas/metrics/_faithfulness.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,4 +313,58 @@ def save(self, cache_dir: t.Optional[str] = None) -> None:
313313
self.statement_prompt.save(cache_dir)
314314

315315

316+
@dataclass
class FaithulnesswithHHEM(Faithfulness):
    """Faithfulness metric whose claim-verification step is done by Vectara's
    HHEM-2.1 hallucination-detection classifier instead of the LLM.

    Statements are still decomposed from the answer by the LLM (as in
    ``Faithfulness``); each statement is then scored against the joined
    contexts by the local ``vectara/hallucination_evaluation_model``.
    Requires the ``transformers`` package.
    """

    # NOTE: class/metric name kept as-is (including the historical spelling)
    # because it is exported from ragas.metrics.__init__ and used by callers.
    name: str = "faithfulness_with_hhem"  # type: ignore

    def __post_init__(self):
        # Import lazily so ragas does not hard-depend on transformers.
        try:
            from transformers import AutoModelForSequenceClassification
        except ImportError as err:
            # Chain the original error so the user sees the real import failure.
            raise ImportError(
                "Huggingface transformers must be installed to use this feature, try `pip install transformers`"
            ) from err
        # Downloads/loads the HHEM classifier; trust_remote_code is required
        # because the model ships its own predict() implementation.
        self.nli_classifier = AutoModelForSequenceClassification.from_pretrained(
            "vectara/hallucination_evaluation_model", trust_remote_code=True
        )
        super().__post_init__()

    def _create_pairs(
        self, row: t.Dict, statements: t.List[str]
    ) -> t.List[t.Tuple[str, str]]:
        """
        Create (premise, hypothesis) pairs for the classifier: the row's
        contexts joined into one premise, paired with each statement.
        """
        premise = "\n".join(row["contexts"])
        pairs = [(premise, statement) for statement in statements]
        return pairs

    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
        """
        Return the mean HHEM entailment score over all statements extracted
        from the answer; ``np.nan`` when statement extraction fails or yields
        no statements.
        """
        assert self.llm is not None, "LLM is not set"

        # Step 1: decompose the answer into simple statements via the LLM.
        p_value = self._create_statements_prompt(row)
        statements = await self.llm.generate(
            p_value,
            callbacks=callbacks,
        )
        statements = await _statements_output_parser.aparse(
            statements.generations[0][0].text, p_value, self.llm, self.max_retries
        )

        if statements is None:
            return np.nan

        # Flatten the per-sentence "simpler_statements" lists into one list.
        statements = [item["simpler_statements"] for item in statements.dicts()]
        statements = [item for sublist in statements for item in sublist]

        assert isinstance(statements, t.List), "statements must be a list"

        # Guard: no statements means nothing to verify — avoid predict([])
        # and a division by zero below.
        if not statements:
            return np.nan

        # Step 2: classify each (context, statement) pair with HHEM and
        # average the rounded (0/1) entailment decisions.
        pairs = self._create_pairs(row, statements)
        scores = self.nli_classifier.predict(pairs).detach().numpy().round()
        return scores.sum() / len(scores)
368+
369+
316370
faithfulness = Faithfulness()

0 commit comments

Comments
 (0)