Commit d58dc01

[FIX] - Fix for summarization edge case (#1201)
This PR adds a fix for the issue mentioned in #1108. However, I have a few points to discuss @shahules786:

- I had added `conciseness_score` to penalize long summaries, but I also do not want to promote very short and skimpy summaries; I need to find a middle ground.
- Is `averaging` a good way to combine `QA_score` and `conciseness_score`?
- Ranking-based metrics to measure the quality of summarization (as mentioned by shahul in the above issue).

Given the conclusions we reach on these discussion points, I will push more commits; let's keep this PR open until we resolve them.

---------

Co-authored-by: Shahules786 <[email protected]>
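For context on the second discussion point, here is a minimal standalone sketch (illustrative only, not ragas code; the helper names and numbers are made up) contrasting plain averaging with a weighted combination, using the character-length conciseness term from the docs change below:

```python
# Illustrative sketch of the two combination strategies under discussion.
# Not ragas code; names and numbers here are made up for the example.

def conciseness_score(summary: str, context: str) -> float:
    # 1 minus the relative length of the summary, per the updated docs formula;
    # min() caps copies of the context at 0 and 1e-10 guards an empty context.
    return 1 - min(len(summary), len(context)) / (len(context) + 1e-10)

def simple_average(qa_score: float, c_score: float) -> float:
    # Original behaviour: both components weighted equally.
    return (qa_score + c_score) / 2

def weighted_combination(qa_score: float, c_score: float, coeff: float = 0.5) -> float:
    # Documented alternative: `coeff` shifts weight between the two components.
    return qa_score * coeff + c_score * (1 - coeff)

context = "A company is launching a new product, a smartphone app for fitness tracking."
summary = "A company is launching a fitness tracking app."
qa = 0.8  # e.g. 4 of 5 generated questions answered correctly

c = conciseness_score(summary, context)
print(f"conciseness={c:.2f}, average={simple_average(qa, c):.2f}, "
      f"weighted(coeff=0.7)={weighted_combination(qa, c, coeff=0.7):.2f}")
```

Pushing `coeff` above 0.5 lets one component dominate, which is one way to keep very short but uninformative summaries from scoring well while still rewarding brevity.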
1 parent 7f1073f commit d58dc01

2 files changed (+43 −49 lines)

docs/concepts/metrics/summarization_score.md

Lines changed: 10 additions & 6 deletions

`````diff
@@ -11,18 +11,21 @@ We compute the question-answer score using the answers, which is a list of `1`s
 \text{QA score} = \frac{|\text{correctly answered questions}|}{|\text{total questions}|}
 ````
 
-We also introduce an option to penalize larger summaries by proving a conciseness score. If this option is enabled, the final score is calculated as the average of the summarization score and the conciseness score. This conciseness scores ensures that summaries that are just copies of the text do not get a high score, because they will obviously answer all questions correctly.
+We also introduce an option to penalize larger summaries by proving a conciseness score. If this option is enabled, the final score is calculated as the weighted average of the summarization score and the conciseness score. This conciseness scores ensures that summaries that are just copies of the text do not get a high score, because they will obviously answer all questions correctly. Also, we do not want the summaries that are empty. We add a small value `1e-10` to the denominator to avoid division by zero.
 
 ```{math}
 :label: conciseness-score
-\text{conciseness score} = 1 - \frac{\text{length of summary}}{\text{length of context}}
+\text{conciseness score} = 1 - \frac{\min(\text{length of summary}, \text{length of context})}{\text{length of context} + \text{1e-10}}
 ````
 
+We also provide a coefficient `coeff`(default value 0.5) to control the weightage of the scores.
+
 The final summarization score is then calculated as:
 
 ```{math}
 :label: summarization-score
-\text{Summarization Score} = \frac{\text{QA score} + \text{conciseness score}}{2}
+\text{Summarization Score} = \text{QA score}*\text{coeff} + \\
+\text{conciseness score}*\text{(1-coeff)}
 ````
 
 ```{hint}
`````
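Before the example section of the same file, a quick numeric check of the updated formulas (illustrative numbers only, not taken from the commit):

```python
# Illustrative numbers: a 400-character context, a 100-character summary,
# and 4 of 5 generated questions answered correctly from the summary.
context_len, summary_len = 400, 100
qa_score = 4 / 5                                                          # 0.8
conciseness = 1 - min(summary_len, context_len) / (context_len + 1e-10)   # ~0.75
coeff = 0.5
print(qa_score * coeff + conciseness * (1 - coeff))                       # ~0.775
```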
`````diff
@@ -61,13 +64,14 @@ The final summarization score is then calculated as:
 ## Example
 
 ```{code-block} python
-from datasets import Dataset
 from ragas.metrics import summarization_score
 from ragas import evaluate
+from datasets import Dataset
+
 
 data_samples = {
-    'contexts' : [[c1], [c2]],
-    'summary': [s1, s2]
+    'contexts':[["A company is launching a new product, a smartphone app designed to help users track their fitness goals. The app allows users to set daily exercise targets, log their meals, and track their water intake. It also provides personalized workout recommendations and sends motivational reminders throughout the day."]],
+    'summary':['A company is launching a fitness tracking app that helps users set exercise goals, log meals, and track water intake, with personalized workout suggestions and motivational reminders.'],
 }
 dataset = Dataset.from_dict(data_samples)
 score = evaluate(dataset,metrics=[summarization_score])
`````
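The docs example now uses a single concrete row. For several rows, `contexts` stays a list of string lists (each row's contexts are joined with newlines inside `_ascore`, as the source diff below shows) and `summary` stays a flat list of strings — a sketch with dummy strings:

```python
# Dummy strings, shown only to illustrate the expected column shapes.
data_samples = {
    "contexts": [
        ["row 1, context passage A", "row 1, context passage B"],
        ["row 2, single context passage"],
    ],
    "summary": [
        "summary for row 1",
        "summary for row 2",
    ],
}
```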

src/ragas/metrics/_summarization.py

Lines changed: 33 additions & 43 deletions

`````diff
@@ -8,7 +8,7 @@
 from langchain.pydantic_v1 import BaseModel
 
 from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions
-from ragas.llms.prompt import Prompt, PromptValue
+from ragas.llms.prompt import Prompt
 from ragas.metrics.base import EvaluationMode, MetricWithLLM
 
 if t.TYPE_CHECKING:
@@ -145,65 +145,49 @@ class SummarizationScore(MetricWithLLM):
     name: str = "summary_score"  # type: ignore
     max_retries: int = 1
     length_penalty: bool = True
-    evaluation_mode: EvaluationMode = EvaluationMode.ca  # type: ignore[reportIncompatibleMethodOverride]
+    coeff: float = 0.5
+    evaluation_mode: EvaluationMode = EvaluationMode.ca  # type: ignore
     question_generation_prompt: Prompt = field(
         default_factory=lambda: TEXT_GENERATE_QUESTIONS
     )
     answer_generation_prompt: Prompt = field(
         default_factory=lambda: TEXT_GENERATE_ANSWERS
     )
-
-    def _get_extract_keyphrases_prompt(self, text) -> PromptValue:
-        return TEXT_EXTRACT_KEYPHRASES.format(text=text)
-
-    def _get_question_generation_prompt(self, text, keyphrases) -> PromptValue:
-        return TEXT_GENERATE_QUESTIONS.format(text=text, keyphrases=keyphrases)
-
-    def _get_answer_generation_prompt(
-        self, questions: t.List, summary: str
-    ) -> PromptValue:
-        return TEXT_GENERATE_ANSWERS.format(summary=summary, questions=questions)
+    extract_keyphrases_prompt: Prompt = field(
+        default_factory=lambda: TEXT_EXTRACT_KEYPHRASES
+    )
 
     async def _ascore(self, row: Dict, callbacks: Callbacks) -> float:
-        # text is the contexts provided
-        # summary is the summary generated by the model
-        # TODO: add support for the query used as well
         text: str = "\n".join(row["contexts"])
         summary: str = row["summary"]
         keyphrases = await self._extract_keyphrases(text, callbacks)
         questions = await self._get_questions(text, keyphrases, callbacks)
         answers = await self._get_answers(questions, summary, callbacks)
 
-        scores = []
+        scores = {}
         qa_score = self._compute_qa_score(answers)
-        scores.append(qa_score)
+        scores["qa_score"] = qa_score
         if self.length_penalty:
             conciseness_score = self._compute_conciseness_score(text, summary)
-            scores.append(conciseness_score)
+            scores["conciseness_score"] = conciseness_score
         return self._compute_score(scores)
 
     def _compute_score(self, scores) -> float:
-        """Returns average score of the different scores."""
-        return sum(scores) / len(scores)
+        return (
+            scores["qa_score"] * (1 - self.coeff)
+            + scores.get("conciseness_score", 0) * self.coeff
+        )
 
     def _compute_qa_score(self, answers: t.List[str]) -> float:
-        """Returns a score between 0 and 1 reflecting the fraction of
-        correct answers, ie with a value 'yes'
-        """
         correct = sum([1 for a in answers if a.lower() == "1"])
         return correct / len(answers)
 
     def _compute_conciseness_score(self, text, summary) -> float:
-        """Returns the conciseness score of the summary. This is calculated as
-        (1- relative_length_of_summary), where relative_length_of_summary is the
-        ratio of the length of the summary to the length of the original text.
-        This promotes shorter summaries.
-        """
-        return 1 - (len(summary) / len(text))
+        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
 
     async def _extract_keyphrases(self, text: str, callbacks: Callbacks) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_extract_keyphrases_prompt(text)
+        p_value = self.extract_keyphrases_prompt.format(text=text)
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -223,7 +207,9 @@ async def _get_questions(
         self, text: str, keyphrases: list[str], callbacks: Callbacks
     ) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_question_generation_prompt(text, keyphrases)
+        p_value = self.question_generation_prompt.format(
+            text=text, keyphrases=keyphrases
+        )
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -244,7 +230,9 @@ async def _get_answers(
         self, questions: t.List[str], summary: str, callbacks: Callbacks
     ) -> t.List[str]:
         assert self.llm is not None, "LLM is not initialized"
-        p_value = self._get_answer_generation_prompt(questions, summary)
+        p_value = self.answer_generation_prompt.format(
+            questions=questions, summary=summary
+        )
         result = await self.llm.generate(
             prompt=p_value,
             callbacks=callbacks,
@@ -261,17 +249,19 @@ async def _get_answers(
 
         return response.answers
 
+    def adapt(self, language: str, cache_dir: str | None = None) -> None:
+        assert self.llm is not None, "set LLM before use"
 
-    def adapt(self, language: str, cache_dir: str | None = None) -> None:
-        assert self.llm is not None, "set LLM before use"
-
-        logger.info(f"Adapting summarization to {language}")
-        self.question_generation_prompt = self.question_generation_prompt.adapt(
-            language, self.llm, cache_dir
-        )
-        self.answer_generation_prompt = self.answer_generation_prompt.adapt(
-            language, self.llm, cache_dir
-        )
+        logger.info(f"Adapting summarization to {language}")
+        self.question_generation_prompt = self.question_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
+        self.answer_generation_prompt = self.answer_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
+        self.answer_generation_prompt = self.answer_generation_prompt.adapt(
+            language, self.llm, cache_dir
+        )
 
 
 summarization_score = SummarizationScore()
`````
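Since `coeff` and `length_penalty` are ordinary dataclass fields on `SummarizationScore` (see the hunk above), a non-default weighting would presumably be configured by instantiating the metric rather than using the module-level `summarization_score` singleton. A sketch under that assumption — the constructor keywords and import path are inferred from the diff, not documented API:

```python
from datasets import Dataset
from ragas import evaluate

# Assumption: SummarizationScore is a dataclass (it uses `field(...)` above), so its
# fields should be settable as keyword arguments; the import path mirrors the file
# touched in this commit. Treat this as a sketch, not documented API.
from ragas.metrics._summarization import SummarizationScore

# coeff controls how the QA and conciseness scores are weighted (0.5 weights them equally).
custom_summary_score = SummarizationScore(coeff=0.7, length_penalty=True)

data_samples = {
    'contexts': [["A company is launching a smartphone app to help users track their fitness goals."]],
    'summary': ['A company is launching a fitness tracking app.'],
}
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset, metrics=[custom_summary_score])
```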
