Improve metrics (#131)

shahules786 · web-flow · commit 53b626e07df9 · 2023-09-13T23:49:23.000+05:30
- Improve faithfulness prompts
- Change the sentence tokenizer to a better one (pysbd) to improve context
recall and relevancy.
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,8 @@ dependencies = [
     "protobuf<=3.20.0",
     "langchain>=0.0.218",
     "openai",
-    "pydantic<2.0"
+    "pydantic<2.0",
+    "pysbd>=0.3.4",
 ]
 dynamic = ["version", "readme"]
 
diff --git a/src/ragas/metrics/context_relevance.py b/src/ragas/metrics/context_relevance.py
@@ -6,6 +6,7 @@
 from typing import List
 
 import numpy as np
+import pysbd
 from datasets import Dataset
 from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
 from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
@@ -24,8 +25,16 @@
 )
 
 
-def sent_tokenize(sent: str) -> List[str]:
-    return [s[:-1] if s.endswith(".") else s for s in sent.strip().split(". ")]
+seg = pysbd.Segmenter(language="en", clean=False)
+
+
+def sent_tokenize(text: str) -> List[str]:
+    """
+    Tokenize text into a list of sentences using the module-level pysbd segmenter `seg`.
+    """
+    sentences = seg.segment(text)
+    assert isinstance(sentences, list)  # check that the segmenter returned a list
+    return sentences
 
 
 class SentenceAgreement:
@@ -85,7 +94,7 @@ class ContextRelevancy(MetricWithLLM):
         Batch size for openai completion.
     strictness : int
         Controls the number of times sentence extraction is performed to quantify
-        uncertainty from the LLM. Defaults to 2.
+        uncertainty from the LLM. Defaults to 1.
     agreement_metric : str
         "bert_score" or "jaccard_score", used to measure agreement between multiple
         samples.
@@ -96,7 +105,7 @@ class ContextRelevancy(MetricWithLLM):
     name: str = "context_relevancy"
     evaluation_mode: EvaluationMode = EvaluationMode.qc
     batch_size: int = 15
-    strictness: int = 2
+    strictness: int = 1
     agreement_metric: str = "bert_score"
     model_name: str = "cross-encoder/stsb-TinyBERT-L-4"
 
diff --git a/src/ragas/metrics/faithfulnes.py b/src/ragas/metrics/faithfulnes.py
@@ -17,7 +17,7 @@
 #################
 LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template(
     """\
-Given a question and answer, create one or more statements from answer.
+Given a question and answer, create one or more statements from each sentence in the given answer.
 question: Who was  Albert Einstein and what is he best known for?
 answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
 statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity.

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,8 @@ dependencies = [`
`8`	`8`	`"protobuf<=3.20.0",`
`9`	`9`	`"langchain>=0.0.218",`
`10`	`10`	`"openai",`
`11`		`- "pydantic<2.0"`
	`11`	`+ "pydantic<2.0",`
	`12`	`+ "pysbd>=0.3.4",`
`12`	`13`	`]`
`13`	`14`	`dynamic = ["version", "readme"]`
`14`	`15`