
Commit a164263

make metrics work with new llm interface
1 parent 0eb5002 commit a164263

8 files changed, +172 -31 lines changed


nbs/metric/base.ipynb

Lines changed: 9 additions & 3 deletions
@@ -43,14 +43,14 @@
 "from pydantic import BaseModel\n",
 "import typing as t\n",
 "from ragas_annotator.metric import MetricResult\n",
-"from ragas_annotator.metric import LLM\n",
+"from ragas_annotator.llm import RagasLLM\n",
 "\n",
 "@dataclass\n",
 "class Metric(ABC):\n",
 "    \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n",
 "    name: str\n",
 "    prompt: str\n",
-"    llm: LLM\n",
+"    llm: RagasLLM\n",
 "    _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n",
 "        default_factory=dict, init=False, repr=False\n",
 "    )\n",
@@ -136,7 +136,13 @@
 ]
 }
 ],
-"metadata": {},
+"metadata": {
+"kernelspec": {
+"display_name": "python3",
+"language": "python",
+"name": "python3"
+}
+},
 "nbformat": 4,
 "nbformat_minor": 2
 }
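The only functional change in this notebook is the type of the `llm` field: `Metric.llm` is now annotated as `RagasLLM` from the new `ragas_annotator.llm` package. A minimal sketch of constructing such an instance, mirroring the example cells later in this commit; it assumes the `openai` package is installed and `OPENAI_API_KEY` is set in the environment.

from openai import OpenAI

from ragas_annotator.llm import RagasLLM, ragas_llm

# The new LLM interface: a provider-agnostic wrapper built by the ragas_llm factory.
# This is what Metric.llm now expects instead of the old metric-local LLM class.
llm: RagasLLM = ragas_llm(provider="openai", model="gpt-4o", client=OpenAI())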

nbs/metric/decorator.ipynb

Lines changed: 31 additions & 3 deletions
@@ -30,6 +30,7 @@
 "import asyncio\n",
 "from dataclasses import dataclass\n",
 "from ragas_annotator.metric import MetricResult\n",
+"from ragas_annotator.llm import RagasLLM\n",
 "\n",
 "\n",
 "\n",
@@ -44,7 +45,7 @@
 " Returns:\n",
 " A decorator factory function for the specified metric type\n",
 " \"\"\"\n",
-" def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n",
+" def decorator_factory(llm:RagasLLM, prompt, name: t.Optional[str] = None, **metric_params):\n",
 " \"\"\"\n",
 " Creates a decorator that wraps a function into a metric instance.\n",
 " \n",
@@ -149,6 +150,29 @@
 "### Example usage\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -168,12 +192,16 @@
 "\n",
 "\n",
 "from ragas_annotator.metric import DiscreteMetric\n",
-"from ragas_annotator.metric.llm import LLM\n",
 "from pydantic import BaseModel\n",
 "\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "discrete_metric = create_metric_decorator(DiscreteMetric)\n",
 "\n",
-"@discrete_metric(llm=LLM(),\n",
+"@discrete_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/discrete.ipynb

Lines changed: 42 additions & 6 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "import typing as t\n",
@@ -81,6 +90,29 @@
 "## Example usage"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -91,18 +123,22 @@
 "output_type": "stream",
 "text": [
 "low\n",
-"No context or content was provided for evaluation.\n"
+"The response does not provide any specific information or context that can help evaluate its helpfulness.\n"
 ]
 }
 ],
 "source": [
 "\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "\n",
 "my_metric = DiscreteMetric(\n",
-" llm=LLM(),\n",
+" llm=llm,\n",
 " name='helpfulness',\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " values=[\"low\",\"med\",\"high\"],\n",
@@ -130,14 +166,14 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"low\n",
+"high\n",
 "reason\n"
 ]
 }
 ],
 "source": [
 "#| eval: false\n",
-"@discrete_metric(llm=LLM(),\n",
+"@discrete_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/numeric.ipynb

Lines changed: 43 additions & 6 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "\n",
@@ -81,7 +90,7 @@
 {
 "data": {
 "text/plain": [
-"'The response does not provide any context or information that can be evaluated as helpful.'"
+"True"
 ]
 },
 "execution_count": null,
@@ -90,13 +99,41 @@
 }
 ],
 "source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"\"The provided input lacks context or content to determine if it is helpful as it merely states 'this is my response' without any additional information.\""
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
+"\n",
 "\n",
 "my_metric = NumericMetric(\n",
 " name='helpfulness',\n",
-" llm=LLM(),\n",
+" llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " range=(0,10),\n",
 ")\n",
@@ -122,7 +159,7 @@
 {
 "data": {
 "text/plain": [
-"20"
+"10"
 ]
 },
 "execution_count": null,
@@ -134,7 +171,7 @@
 "\n",
 "#| eval: false\n",
 "\n",
-"@numeric_metric(llm=LLM(),\n",
+"@numeric_metric(llm=llm,\n",
 " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
 " name='new_metric',range=(0,10))\n",
 "def my_metric(llm,prompt,**kwargs):\n",

nbs/metric/ranking.ipynb

Lines changed: 43 additions & 8 deletions
@@ -21,7 +21,16 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+" from .autonotebook import tqdm as notebook_tqdm\n"
+]
+}
+],
 "source": [
 "#| export\n",
 "\n",
@@ -108,6 +117,29 @@
 "### Example usage"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"True"
+]
+},
+"execution_count": null,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"from dotenv import load_dotenv\n",
+"\n",
+"# Load environment variables from a .envrc file\n",
+"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -117,23 +149,26 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[0, 1, 2]\n",
+"[2, 1, 0]\n",
 "Ensemble ranking based on multiple evaluations.\n",
-"The ranking is based on the length and detail of the responses, with 'short answer.' being the least detailed (rank 0), 'a bit more detailed.' being moderate (rank 1), and 'the longest and most detailed answer.' being the most comprehensive (rank 2).\n",
-"The ranking is based on the length and detail of the responses. The shortest response is ranked the lowest (0), the moderately detailed response is ranked higher (1), and the longest and most detailed response is ranked the highest (2).\n",
-"Ranking is based on length and detail; the longest answer (2) is most detailed, followed by a bit more detailed (1), and the shortest answer (0) is the least detailed.\n"
+"The ranking is based on the length and detail of each response. 'the longest and most detailed answer.' is the most comprehensive, followed by 'a bit more detailed.', and 'short answer.' is the briefest.\n",
+"The ranking is based on the length and detail of each response. The response 'the longest and most detailed answer.' is ranked highest (2) because it is the most detailed, followed by 'a bit more detailed.' (1), and finally 'short answer.' (0) as it is the least detailed.\n",
+"The responses are ranked based on the level of detail and length. 'short answer.' is the least detailed, 'a bit more detailed.' provides more information, and 'the longest and most detailed answer.' offers the most comprehensive explanation.\n"
 ]
 }
 ],
 "source": [
 "\n",
 "#| eval: false\n",
 "\n",
-"from ragas_annotator.metric.llm import LLM\n",
+"from ragas_annotator.llm import ragas_llm\n",
+"from openai import OpenAI\n",
+"\n",
+"llm = ragas_llm(provider=\"openai\",model=\"gpt-4o\",client=OpenAI())\n",
 "\n",
 "my_ranking_metric = RankingMetric(\n",
 " name='response_ranking',\n",
-" llm=LLM(), # Your language model instance\n",
+" llm=llm, # Your language model instance\n",
 " prompt=\"Rank the following responses:\\n{candidates}\",\n",
 " num_ranks=3,\n",
 ")\n",
@@ -175,7 +210,7 @@
 "\n",
 "\n",
 "@ranking_metric(\n",
-" llm=LLM(), # Your language model instance\n",
+" llm=llm, # Your language model instance\n",
 " prompt=\"Rank the following responses:\\n{candidates}\",\n",
 " name='new_ranking_metric',\n",
 " num_ranks=3\n",

ragas_annotator/metric/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -1,12 +1,10 @@
 from ragas_annotator.metric.result import MetricResult
-from ragas_annotator.metric.llm import LLM
 from ragas_annotator.metric.base import Metric
 from ragas_annotator.metric.discrete import DiscreteMetric
 from ragas_annotator.metric.numeric import NumericMetric
 from ragas_annotator.metric.ranking import RankingMetric
 
 __all__ = ['MetricResult',
-           'LLM',
            'Metric',
            'DiscreteMetric',
            'NumericMetric',
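With LLM dropped from the package's exports, the import surface for callers changes as sketched below, assuming ragas_annotator.llm exposes both RagasLLM and the ragas_llm factory, as the notebook imports in this commit indicate.

# Before this commit:
#   from ragas_annotator.metric import LLM
#   llm = LLM()

# After this commit:
from ragas_annotator.llm import RagasLLM, ragas_llm  # new home of the LLM interface
from ragas_annotator.metric import (                  # metric exports are unchanged
    DiscreteMetric,
    Metric,
    MetricResult,
    NumericMetric,
    RankingMetric,
)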

ragas_annotator/metric/base.py

Lines changed: 2 additions & 2 deletions
@@ -12,14 +12,14 @@
 from pydantic import BaseModel
 import typing as t
 from . import MetricResult
-from . import LLM
+from ..llm import RagasLLM
 
 @dataclass
 class Metric(ABC):
     """Base class for all metrics in the LLM evaluation library."""
     name: str
     prompt: str
-    llm: LLM
+    llm: RagasLLM
     _response_models: t.Dict[bool, t.Type[BaseModel]] = field(
         default_factory=dict, init=False, repr=False
     )
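One practical consequence of the type change: downstream code that wires an LLM into a metric should now annotate against RagasLLM instead of the removed LLM class. The helper below is hypothetical and purely illustrative of that annotation.

from openai import OpenAI

from ragas_annotator.llm import RagasLLM, ragas_llm
from ragas_annotator.metric import DiscreteMetric, Metric


def build_helpfulness_metric(llm: RagasLLM) -> Metric:
    # Hypothetical factory: any RagasLLM produced by ragas_llm(...) satisfies
    # the Metric.llm field introduced in the hunk above.
    return DiscreteMetric(
        llm=llm,
        name='helpfulness',
        prompt="Evaluate if given answer is helpful\n\n{response}",
        values=["low", "med", "high"],
    )


metric = build_helpfulness_metric(
    ragas_llm(provider="openai", model="gpt-4o", client=OpenAI())
)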
