Commit 4d01af2

feat: improve quality of answer correctness (#339)
1 parent ba3b109 commit 4d01af2

File tree: 2 files changed, +152 -59 lines

experiments/assesments/metrics_assesments.ipynb

Lines changed: 76 additions & 44 deletions
@@ -98,46 +98,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
    "id": "b3139189",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\n",
-      "  \"role\": \"assistant\",\n",
-      "  \"content\": \"How can I assist you today?\"\n",
-      "}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
-    "import openai\n",
-    "\n",
-    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
-    "\n",
-    "completion = openai.ChatCompletion.create(\n",
-    "    model=\"gpt-3.5-turbo\",\n",
-    "    messages=[\n",
-    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "    ],\n",
-    ")\n",
+    "from openai import OpenAI\n",
     "\n",
-    "print(completion.choices[0].message)"
+    "client = OpenAI()\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 18,
    "id": "4bce4c53",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def llm2(prompt, **kwargs):\n",
-    "    response = openai.ChatCompletion.create(\n",
+    "def llm(prompt, **kwargs):\n",
+    "    response = client.chat.completions.create(\n",
     "        model=kwargs.get(\"model\", \"gpt-3.5-turbo\"),\n",
     "        messages=[{\"role\": \"system\", \"content\": prompt}],\n",
     "        temperature=kwargs.get(\"temperature\", 0),\n",
@@ -147,27 +127,12 @@
     "        max_tokens=kwargs.get(\"max_tokens\", 500),\n",
     "        n=kwargs.get(\"n\", 1),\n",
     "    )\n",
-    "    return response\n",
-    "\n",
-    "\n",
-    "def llm(prompt, **kwargs):\n",
-    "    response = openai.Completion.create(\n",
-    "        model=kwargs.get(\"model\", \"text-davinci-003\"),\n",
-    "        prompt=prompt,\n",
-    "        temperature=kwargs.get(\"temperature\", 0),\n",
-    "        top_p=kwargs.get(\"top_p\", 1),\n",
-    "        frequency_penalty=kwargs.get(\"frequency_penalty\", 0.0),\n",
-    "        presence_penalty=kwargs.get(\"presence_penalty\", 0.0),\n",
-    "        max_tokens=kwargs.get(\"max_tokens\", 500),\n",
-    "        logprobs=kwargs.get(\"logprobs\", 0),\n",
-    "        n=kwargs.get(\"n\", 1),\n",
-    "    )\n",
     "    return response"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 19,
    "id": "4d9b4e31",
    "metadata": {},
    "outputs": [],
@@ -2341,11 +2306,78 @@
     "results.to_dict()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "387bb6ea",
+   "metadata": {},
+   "source": [
+    "## Answer correctness"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "47465fd1",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from ragas.metrics import answer_correctness"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "76b13fc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {\"question\":\"Where is France and what's it capital?\", \"answer\":\"Asia\",\n",
+    "        'ground_truths':[\"France is in Europe and it's capital is Paris\"]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "817f4150",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "faith [0.0]\n",
+      "sim [True]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.5"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "answer_correctness.score_single(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50b595cf",
+   "metadata": {},
    "outputs": [],
    "source": []
   }
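For reference, the notebook cells above move the helper from the legacy openai.ChatCompletion interface to the openai>=1.0 client. A minimal, self-contained sketch of that pattern (a sketch only, not taken verbatim from the commit; it assumes OPENAI_API_KEY is set in the environment and, unlike the notebook helper, returns the completion text instead of the raw response object):

import os
from openai import OpenAI

# The v1 client reads OPENAI_API_KEY from the environment by default;
# passing it explicitly just makes the assumption visible.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def llm(prompt, **kwargs):
    # Single system-message chat completion, mirroring the notebook helper.
    response = client.chat.completions.create(
        model=kwargs.get("model", "gpt-3.5-turbo"),
        messages=[{"role": "system", "content": prompt}],
        temperature=kwargs.get("temperature", 0),
        max_tokens=kwargs.get("max_tokens", 500),
        n=kwargs.get("n", 1),
    )
    return response.choices[0].message.content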

src/ragas/metrics/_answer_correctness.py

Lines changed: 76 additions & 15 deletions
@@ -5,13 +5,47 @@
 
 import numpy as np
 from datasets import Dataset
+from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 
 from ragas.metrics._answer_similarity import AnswerSimilarity
-from ragas.metrics._faithfulness import Faithfulness
 from ragas.metrics.base import EvaluationMode, MetricWithLLM
+from ragas.utils import load_as_json
 
-if t.TYPE_CHECKING:
-    from langchain.callbacks.manager import CallbackManager
+CORRECTNESS_PROMPT = HumanMessagePromptTemplate.from_template(
+    """
+Extract following from given question and ground truth
+
+Question:What powers the sun and what is its primary function?
+Answer: The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system.
+Ground truth: The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents.
+Extracted statements:
+[
+{{
+"statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"],
+"statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"],
+"relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"]
+}}
+]
+
+Question: What is the boiling point of water?
+Answer: The boiling point of water is 100 degrees Celsius at sea level.
+Ground truth: The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude.
+Extracted statements:
+[
+{{
+"statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"],
+"statements present in the answer but not found in the ground truth": [],
+"relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"]
+}}
+]
+
+
+Question:{question}
+Answer: {answer}
+Ground truth: {ground_truth}
+Extracted statements:"""  # noqa: E501
+)
 
 
 @dataclass
@@ -39,34 +73,61 @@ class AnswerCorrectness(MetricWithLLM):
     name: str = "answer_correctness"
     evaluation_mode: EvaluationMode = EvaluationMode.qga
     batch_size: int = 15
-    weights: list[float] = field(default_factory=lambda: [0.5, 0.5])
+    weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
     answer_similarity: AnswerSimilarity | None = None
-    faithfulness: Faithfulness | None = None
 
     def __post_init__(self: t.Self):
         if self.answer_similarity is None:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
             )
-        if self.faithfulness is None:
-            self.faithfulness = Faithfulness(llm=self.llm, batch_size=self.batch_size)
 
     def _score_batch(
         self: t.Self,
         dataset: Dataset,
         callbacks: t.Optional[CallbackManager] = None,
         callback_group_name: str = "batch",
     ) -> list[float]:
-        if "contexts" in dataset.column_names:
-            ds_faithfulness = dataset.remove_columns(["contexts"])
-        else:
-            ds_faithfulness = dataset
+        question, answer, ground_truths = (
+            dataset["question"],
+            dataset["answer"],
+            dataset["ground_truths"],
+        )
+        prompts = []
+
+        with trace_as_chain_group(
+            callback_group_name, callback_manager=callbacks
+        ) as batch_group:
+            for q, a, g in zip(question, answer, ground_truths):
+                human_prompt = CORRECTNESS_PROMPT.format(
+                    question=q, ground_truth=g[0], answer=a
+                )
+                prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
+
+            result = self.llm.generate(prompts, callbacks=batch_group)
+            outputs = result.generations
+            key_map = {
+                "TP": "statements that are present in both the answer and the ground truth",
+                "FP": "statements present in the answer but not found in the ground truth",
+                "FN": "relevant statements found in the ground truth but omitted in the answer",  # noqa: E501
+            }
+
+            f1_score = []
+            for prediction in outputs:
+                prediction = load_as_json(prediction[0].text)
+                prediction = [
+                    item.get(key_map[k], np.nan)
+                    for item in prediction
+                    for k in key_map.keys()
+                ]
+                tp, fp, fn = [
+                    len(item) if isinstance(item, list) else np.nan for item in prediction
+                ]
+                score = tp / (tp + 0.5 * (fp + fn))
+                f1_score.append(score)
 
-        ds_faithfulness = ds_faithfulness.rename_columns({"ground_truths": "contexts"})
-        faith_scores = self.faithfulness._score_batch(ds_faithfulness)  # type: ignore
         similarity_scores = self.answer_similarity._score_batch(dataset)  # type: ignore
-
-        scores_stacked = np.vstack([faith_scores, similarity_scores])
+        scores_stacked = np.vstack([f1_score, similarity_scores])
         scores = np.average(
             scores_stacked,
             axis=0,
0 commit comments

Comments
 (0)