answer-correctness: fix edge cases (#970)

shahules786 · web-flow · commit 6cf7773bae14 · 2024-05-20T13:16:52.000+05:30
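fix: #959
- rename the correctness prompt's output key (and its example entries) from "extracted_statements" to "classification"
- flatten each item's "simpler_statements" into plain lists before formatting the correctness prompt
- replace the `val is []` identity check (which never matches) with an equality check; when every statement list is empty, skip the LLM call and use an F1 score of 1.0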
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
@@ -66,7 +66,7 @@ class AnswerCorrectnessClassification(BaseModel):
                 "The sun's light plays a critical role in Earth's climate system.",
                 "Sunlight helps to drive the weather and ocean currents.",
             ],
-            "extracted_statements": AnswerCorrectnessClassification.parse_obj(
+            "classification": AnswerCorrectnessClassification.parse_obj(
                 {
                     "TP": [
                         {
@@ -114,7 +114,7 @@ class AnswerCorrectnessClassification(BaseModel):
                 "The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level.",
                 "The boiling point of water can change with altitude.",
             ],
-            "extracted_statements": AnswerCorrectnessClassification.parse_obj(
+            "classification": AnswerCorrectnessClassification.parse_obj(
                 {
                     "TP": [
                         {
@@ -134,7 +134,7 @@ class AnswerCorrectnessClassification(BaseModel):
         },
     ],
     input_keys=["question", "answer", "ground_truth"],
-    output_key="extracted_statements",
+    output_key="classification",
     output_type="json",
 )
 
@@ -231,26 +231,36 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> fl
                 statements[item].dicts() if statements[item] is not None else []
             )
 
-        if any(val is [] for val in statements.values()):
-            return np.nan
-
-        p_value = self.correctness_prompt.format(
-            question=question,
-            ground_truth=statements["ground_truth"],
-            answer=statements["answer"],
-        )
-        is_statement_present = await self.llm.generate(
-            p_value, callbacks=callbacks, is_async=is_async
-        )
-        result_text = is_statement_present.generations[0][0].text
+        if not all([val == [] for val in statements.values()]):
+            ground_truth = [
+                statement
+                for item in statements["ground_truth"]
+                for statement in item["simpler_statements"]
+            ]
+            answer = [
+                statement
+                for item in statements["answer"]
+                for statement in item["simpler_statements"]
+            ]
+            p_value = self.correctness_prompt.format(
+                question=question,
+                ground_truth=ground_truth,
+                answer=answer,
+            )
+            is_statement_present = await self.llm.generate(
+                p_value, callbacks=callbacks, is_async=is_async
+            )
+            result_text = is_statement_present.generations[0][0].text
 
-        answers = await _output_parser.aparse(
-            result_text, p_value, self.llm, self.max_retries
-        )
-        if answers is None:
-            return np.nan
+            answers = await _output_parser.aparse(
+                result_text, p_value, self.llm, self.max_retries
+            )
+            if answers is None:
+                return np.nan
 
-        f1_score = self._compute_statement_presence(answers)
+            f1_score = self._compute_statement_presence(answers)
+        else:
+            f1_score = 1.0
 
         if self.weights[1] == 0:
             similarity_score = 0.0