
Commit 3ba3f46

Merge pull request #4 from explodinggradients/prompt
prompt and few shot learning
2 parents: 6142800 + 09eec74

19 files changed, +2297 -56 lines changed


nbs/embedding/base.ipynb

Lines changed: 1150 additions & 0 deletions
Large diffs are not rendered by default.

nbs/metric/base.ipynb

Lines changed: 54 additions & 4 deletions
@@ -42,19 +42,32 @@
"from dataclasses import dataclass, field\n",
"from pydantic import BaseModel\n",
"import typing as t\n",
+ "import json\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "from ragas_annotator.prompt.base import Prompt\n",
+ "from ragas_annotator.embedding.base import BaseEmbedding\n",
"from ragas_annotator.metric import MetricResult\n",
"from ragas_annotator.llm import RagasLLM\n",
+ "from ragas_annotator.project.core import Project\n",
+ "from ragas_annotator.model.notion_model import NotionModel\n",
+ "from ragas_annotator.prompt.dynamic_few_shot import DynamicFewShotPrompt\n",
+ "\n",
"\n",
"@dataclass\n",
"class Metric(ABC):\n",
" \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n",
" name: str\n",
- " prompt: str\n",
+ " prompt: str | Prompt\n",
" llm: RagasLLM\n",
" _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n",
" default_factory=dict, init=False, repr=False\n",
" )\n",
" \n",
+ " def __post_init__(self):\n",
+ " if isinstance(self.prompt,str):\n",
+ " self.prompt = Prompt(self.prompt)\n",
+ " \n",
" @abstractmethod\n",
" def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n",
" \"\"\"Get the appropriate response model.\"\"\"\n",
@@ -67,22 +80,32 @@
" \n",
" def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:\n",
" responses = []\n",
+ " traces = {}\n",
+ " traces[\"input\"] = kwargs\n",
" prompt_input = self.prompt.format(**kwargs)\n",
" for _ in range(n):\n",
" response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) \n",
+ " traces['output'] = response.model_dump()\n",
" response = MetricResult(**response.model_dump())\n",
" responses.append(response)\n",
- " return self._ensemble(responses)\n",
+ " results = self._ensemble(responses)\n",
+ " results.traces = traces\n",
+ " return results\n",
"\n",
"\n",
" async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult:\n",
" responses = [] # Added missing initialization\n",
+ " traces = {}\n",
+ " traces[\"input\"] = kwargs\n",
" prompt_input = self.prompt.format(**kwargs)\n",
" for _ in range(n):\n",
" response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n",
+ " traces['output'] = response.model_dump()\n",
" response = MetricResult(**response.model_dump()) # Fixed missing parentheses\n",
" responses.append(response)\n",
- " return self._ensemble(responses)\n",
+ " results = self._ensemble(responses)\n",
+ " results.traces = traces\n",
+ " return results\n",
" \n",
" def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]:\n",
" return [self.score(reasoning, n, **input_dict) for input_dict in inputs]\n",
@@ -94,7 +117,34 @@
" async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n",
" \n",
" # Run all tasks concurrently and return results\n",
- " return await asyncio.gather(*async_tasks)"
+ " return await asyncio.gather(*async_tasks)\n",
+ " \n",
+ " def train(self,project:Project, experiment_names: t.List[str], model:NotionModel, embedding_model: BaseEmbedding,method: t.Dict[str, t.Any]):\n",
+ " \n",
+ " assert isinstance(self.prompt, Prompt)\n",
+ " self.prompt = DynamicFewShotPrompt.from_prompt(self.prompt,embedding_model)\n",
+ " datasets = []\n",
+ " for experiment_name in experiment_names:\n",
+ " experiment_data = project.get_experiment(experiment_name,model)\n",
+ " experiment_data.load()\n",
+ " datasets.append(experiment_data)\n",
+ " \n",
+ " total_items = sum([len(dataset) for dataset in datasets])\n",
+ " with tqdm(total=total_items, desc=\"Processing examples\") as pbar:\n",
+ " for dataset in datasets:\n",
+ " for row in dataset:\n",
+ " if hasattr(row, f'{self.name}_traces'):\n",
+ " traces = json.loads(getattr(row, f'{self.name}_traces'))\n",
+ " if traces:\n",
+ " self.prompt.add_example(traces['input'],traces['output'])\n",
+ " pbar.update(1)\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
]
},
{
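Taken together, the base-class changes mean a metric can now be built from a plain string prompt (coerced to a Prompt in __post_init__), every score()/ascore() call attaches an input/output trace to the returned result, and train() replays annotated experiment traces into a DynamicFewShotPrompt. The following is a minimal usage sketch, not part of the commit: it assumes a concrete subclass such as DiscreteMetric with a values field, and leaves the ragas_llm and train() dependencies as placeholders since their constructors are not shown in this diff.

# Usage sketch only: the `values` field on DiscreteMetric and the ragas_llm
# arguments are assumptions, not part of this diff.
from ragas_annotator.llm import ragas_llm
from ragas_annotator.metric import DiscreteMetric

llm = ragas_llm(...)  # hypothetical setup; the exact arguments are not shown in this commit

metric = DiscreteMetric(
    name="helpfulness",
    # A plain string is accepted and wrapped into a Prompt by __post_init__
    prompt="Evaluate if given answer is helpful\n\n{response}",
    llm=llm,
    values=["low", "med", "high"],  # assumed DiscreteMetric-specific field
)

result = metric.score(response="my response")
print(result)         # the discrete score
print(result.traces)  # {'input': {'response': 'my response'}, 'output': {...}} captured during scoring

# Once experiments have been annotated, traces can be replayed as few-shot examples:
# metric.train(project=project, experiment_names=["exp-1"], model=MyNotionModel,
#              embedding_model=embedding_model, method={})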

nbs/metric/decorator.ipynb

Lines changed: 9 additions & 16 deletions
@@ -31,6 +31,7 @@
"from dataclasses import dataclass\n",
"from ragas_annotator.metric import MetricResult\n",
"from ragas_annotator.llm import RagasLLM\n",
+ "from ragas_annotator.prompt.base import Prompt\n",
"\n",
"\n",
"\n",
@@ -45,7 +46,7 @@
" Returns:\n",
" A decorator factory function for the specified metric type\n",
" \"\"\"\n",
- " def decorator_factory(llm:RagasLLM, prompt, name: t.Optional[str] = None, **metric_params):\n",
+ " def decorator_factory(llm:RagasLLM, prompt: t.Union[str, Prompt], name: t.Optional[str] = None, **metric_params):\n",
" \"\"\"\n",
" Creates a decorator that wraps a function into a metric instance.\n",
" \n",
@@ -64,17 +65,9 @@
" metric_name = name or func.__name__\n",
" is_async = inspect.iscoroutinefunction(func)\n",
" \n",
+ " #TODO: Move to dataclass type implementation\n",
" @dataclass\n",
" class CustomMetric(metric_class):\n",
- " def _extract_result(self, result, reasoning: bool):\n",
- " \"\"\"Extract score and reason from the result.\"\"\"\n",
- " if isinstance(result, tuple) and len(result) == 2:\n",
- " score, reason = result\n",
- " else:\n",
- " score, reason = result, None\n",
- " \n",
- " # Use \"result\" instead of \"score\" for the new MetricResult implementation\n",
- " return MetricResult(result=score, reason=reason if reasoning else None)\n",
" \n",
" def _run_sync_in_async(self, func, *args, **kwargs):\n",
" \"\"\"Run a synchronous function in an async context.\"\"\"\n",
@@ -101,7 +94,7 @@
" # Sync function implementation\n",
" result = func(self.llm, self.prompt, **kwargs)\n",
" \n",
- " return self._extract_result(result, reasoning)\n",
+ " return result\n",
" except Exception as e:\n",
" # Handle errors gracefully\n",
" error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n",
@@ -120,7 +113,7 @@
" else:\n",
" # For sync functions, run normally\n",
" result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n",
- " return self._extract_result(result, reasoning)\n",
+ " return result\n",
" \n",
" # Create the metric instance with all parameters\n",
" metric_instance = CustomMetric(\n",
@@ -159,16 +152,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "high\n",
- "reason\n"
+ "low\n",
+ "The context or details of the user's response ('my response') are not provided, making it impossible to evaluate its helpfulness accurately.\n"
]
}
],
"source": [
"#| eval: false\n",
"\n",
"\n",
- "from ragas_annotator.metric import DiscreteMetric\n",
+ "from ragas_annotator.metric import DiscreteMetric, MetricResult\n",
"from pydantic import BaseModel\n",
"\n",
"from ragas_annotator.llm import ragas_llm\n",
@@ -193,7 +186,7 @@
" score = 'low'\n",
" else:\n",
" score = 'high'\n",
- " return score,\"reason\"\n",
+ " return MetricResult(result=score, reason=response.reason)\n",
"\n",
"result = my_metric.score(response='my response') # result\n",
"print(result)\n",

nbs/metric/discrete.ipynb

Lines changed: 21 additions & 16 deletions
@@ -21,16 +21,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"#| export\n",
"import typing as t\n",
@@ -99,8 +90,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "low\n",
- "The response does not provide any specific information or context that can help evaluate its helpfulness.\n"
+ "med\n",
+ "The given input \"this is my response\" is too vague to provide a comprehensive evaluation.\n",
+ "\n",
+ "Positives:\n",
+ "1. Clear Statement: It's a straightforward indication that a response has been provided.\n",
+ "\n",
+ "Negatives:\n",
+ "1. Lack of Context: Without context or additional information, it's impossible to assess the relevance or accuracy of the response.\n",
+ "2. No Specificity: The response doesn't convey any specific information or insight related to a topic or question.\n",
+ "\n",
+ "If this response was intended to be part of a conversation or instruction, more detail would be required to make it highly effective. At present, it serves as a neutral statement without actionable or informative content.\n"
]
}
],
@@ -143,13 +143,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "high\n",
- "reason\n"
+ "low\n",
+ "The prompt 'my response' does not provide sufficient information or context for me to evaluate its helpfulness. An answer needs to be specific and provide insight or information relative to a clear question or context.\n"
]
}
],
"source": [
"#| eval: false\n",
+ "from ragas_annotator.metric.result import MetricResult\n",
+ "\n",
"@discrete_metric(llm=llm,\n",
" prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
" name='new_metric',values=[\"low\",\"med\",\"high\"])\n",
@@ -158,14 +160,17 @@
" class response_model(BaseModel):\n",
" output: t.List[bool]\n",
" reason: str\n",
- " \n",
+ " traces = {}\n",
+ " traces['input'] = kwargs\n",
" response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n",
+ " traces['output'] = response.model_dump()\n",
" total = sum(response.output)\n",
" if total < 1:\n",
" score = 'low'\n",
" else:\n",
" score = 'high'\n",
- " return score,\"reason\"\n",
+ " \n",
+ " return MetricResult(result=score,reason=response.reason,traces=traces)\n",
"\n",
"result = my_metric.score(response='my response') # result\n",
"print(result)\n",

nbs/metric/numeric.ipynb

Lines changed: 5 additions & 1 deletion
@@ -147,6 +147,7 @@
"source": [
"\n",
"#| eval: false\n",
+ "from ragas_annotator.metric import MetricResult\n",
"\n",
"@numeric_metric(llm=llm,\n",
" prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n",
@@ -157,13 +158,16 @@
" output: int\n",
" reason: str\n",
" \n",
+ " traces = {}\n",
+ " traces['input'] = kwargs\n",
" response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n",
+ " traces['output'] = response.dict()\n",
" total = response.output\n",
" if total < 1:\n",
" score = 0\n",
" else:\n",
" score = 10\n",
- " return score,\"reason\"\n",
+ " return MetricResult(result=score,reason=response.reason,traces=traces)\n",
"\n",
"result = my_metric.score(response='my response') # result\n",
"result # 10\n",

nbs/metric/ranking.ipynb

Lines changed: 2 additions & 1 deletion
@@ -185,6 +185,7 @@
"source": [
"#| eval: false\n",
"\n",
+ "from ragas_annotator.metric import MetricResult\n",
"\n",
"@ranking_metric(\n",
" llm=llm, # Your language model instance\n",
@@ -197,7 +198,7 @@
" # For example, process the prompt (formatted with candidates) and produce a ranking.\n",
" ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n",
" reason = \"Ranked based on response clarity and detail.\"\n",
- " return ranking, reason\n",
+ " return MetricResult(result=ranking, reason=reason)\n",
"\n",
"# Using the decorator-based ranking metric:\n",
"result = my_ranking_metric.score(candidates=[\n",

nbs/metric/result.ipynb

Lines changed: 6 additions & 1 deletion
@@ -46,9 +46,14 @@
" - RankingMetrics (list results)\n",
" \"\"\"\n",
" \n",
- " def __init__(self, result: t.Any, reason: t.Optional[str] = None):\n",
+ " def __init__(self, result: t.Any, reason: t.Optional[str] = None, traces: t.Optional[t.Dict[str, t.Any]] = None):\n",
+ " if traces is not None:\n",
+ " invalid_keys = [key for key in traces.keys() if key not in {\"input\", \"output\"}]\n",
+ " if invalid_keys:\n",
+ " raise ValueError(f\"Invalid keys in traces: {invalid_keys}. Allowed keys are 'input' and 'output'.\")\n",
" self._result = result\n",
" self.reason = reason\n",
+ " self.traces = traces\n",
" \n",
" def __repr__(self):\n",
" return repr(self._result)\n",
