
Commit 8307f66

feat: genetic algo based optimizer (#1724)
```python
metric = AspectCritic(
    name="answer_correctness",
    definition="Given the user_input, reference and response. Is the response correct compared with the reference",
    llm=llm_4o,
)
metric.train("alignment_sample.json")
```

[dummy data](https://github.com/user-attachments/files/17997460/alignment_sample.json)
1 parent e2cb28e commit 8307f66

11 files changed: +922 −29 lines changed

src/ragas/callbacks.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -133,12 +133,15 @@ def __str__(self):
 
 def parse_run_traces(
     traces: t.Dict[str, ChainRun],
+    parent_run_id: t.Optional[str] = None,
 ) -> t.List[t.Dict[str, t.Any]]:
+
     root_traces = [
         chain_trace
         for chain_trace in traces.values()
-        if chain_trace.parent_run_id is None
+        if chain_trace.parent_run_id == parent_run_id
     ]
+
     if len(root_traces) > 1:
         raise ValueError(
             "Multiple root traces found! This is a bug on our end, please file an issue and we will fix it ASAP :)"
@@ -159,7 +162,7 @@ def parse_run_traces(
         prompt_traces = {}
         for i, prompt_uuid in enumerate(metric_trace.children):
             prompt_trace = traces[prompt_uuid]
-            prompt_traces[f"{i}_{prompt_trace.name}"] = {
+            prompt_traces[f"{prompt_trace.name}"] = {
                 "input": prompt_trace.inputs.get("data", {}),
                 "output": prompt_trace.outputs.get("output", {}),
             }
```
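The new `parent_run_id` parameter scopes trace parsing to one evaluation run instead of assuming the only root is a parentless trace; `EvaluationResult` (below) passes its `run_id` through. A minimal standalone sketch of the filter, using a hypothetical `FakeChainRun` stand-in rather than the real `ragas.callbacks.ChainRun`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeChainRun:
    """Hypothetical stand-in for ragas.callbacks.ChainRun."""
    name: str
    parent_run_id: Optional[str]

traces = {
    "a": FakeChainRun("evaluation", parent_run_id=None),
    "b": FakeChainRun("evaluation", parent_run_id="run-123"),
}

# Old behaviour: only parent_run_id=None could match as a root.
# New behaviour: the caller picks the root belonging to a specific run.
parent_run_id = "run-123"
root_traces = [t for t in traces.values() if t.parent_run_id == parent_run_id]
assert len(root_traces) == 1 and root_traces[0].parent_run_id == "run-123"
```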

src/ragas/config.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -5,7 +5,7 @@
 from ragas.embeddings import BaseRagasEmbeddings
 from ragas.llms import BaseRagasLLM
 from ragas.losses import Loss
-from ragas.optimizers import Optimizer
+from ragas.optimizers import GeneticOptimizer, Optimizer
 
 DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100}
 
@@ -20,7 +20,7 @@ class DemonstrationConfig(BaseModel):
 class InstructionConfig(BaseModel):
     enabled: bool = True
     loss: t.Optional[Loss] = None
-    optimizer: Optimizer
+    optimizer: Optimizer = GeneticOptimizer()
     optimizer_config: t.Dict[str, t.Any] = Field(
         default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG
     )
```
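Because `optimizer` now defaults to `GeneticOptimizer()`, an `InstructionConfig` can be built with no arguments; `loss` stays `None` and is inferred later from the metric's output type (see `metrics/base.py` below). A sketch, assuming only the names visible in this diff:

```python
from ragas.config import DEFAULT_OPTIMIZER_CONFIG, InstructionConfig

config = InstructionConfig()  # previously required an explicit optimizer
print(type(config.optimizer).__name__)  # -> GeneticOptimizer
print(config.optimizer_config)          # -> {"max_steps": 100}
assert config.optimizer_config == DEFAULT_OPTIMIZER_CONFIG
```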

src/ragas/dataset_schema.py

Lines changed: 11 additions & 2 deletions
```diff
@@ -6,6 +6,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass, field
+from uuid import UUID
 
 import numpy as np
 from datasets import Dataset as HFDataset
@@ -43,6 +44,13 @@ def get_features(self) -> t.List[str]:
         """
         return list(self.to_dict().keys())
 
+    def to_string(self) -> str:
+        """
+        Get the string representation of the sample.
+        """
+        sample_dict = self.to_dict()
+        return "".join(f"\n{key}:\n\t{val}\n" for key, val in sample_dict.items())
+
 
 class SingleTurnSample(BaseSample):
     """
@@ -378,6 +386,7 @@ class EvaluationResult:
     cost_cb: t.Optional[CostCallbackHandler] = None
     traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
     ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)
+    run_id: t.Optional[UUID] = None
 
     def __post_init__(self):
         # transform scores from list of dicts to dict of lists
@@ -395,7 +404,8 @@ def __post_init__(self):
             values.append(value + 1e-10)
 
         # parse the traces
-        self.traces = parse_run_traces(self.ragas_traces)
+        run_id = str(self.run_id) if self.run_id is not None else None
+        self.traces = parse_run_traces(self.ragas_traces, run_id)
 
     def __repr__(self) -> str:
         score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
@@ -531,7 +541,6 @@ def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str:
         return evaluation_endpoint
 
 
-
 class PromptAnnotation(BaseModel):
     prompt_input: t.Dict[str, t.Any]
     prompt_output: t.Dict[str, t.Any]
```
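`to_string` renders a sample as an indented key/value listing. The formatting can be reproduced standalone (hypothetical sample data, independent of `BaseSample`):

```python
sample_dict = {
    "user_input": "What is the capital of France?",
    "response": "Paris",
}

# The same join BaseSample.to_string() applies to self.to_dict()
text = "".join(f"\n{key}:\n\t{val}\n" for key, val in sample_dict.items())
print(text)
# user_input:
#     What is the capital of France?
#
# response:
#     Paris
```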

src/ragas/evaluation.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
 import typing as t
+from uuid import UUID
 
 from datasets import Dataset
 from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager
 from langchain_core.embeddings import Embeddings as LangchainEmbeddings
 from langchain_core.language_models import BaseLanguageModel as LangchainLLM
+from tqdm.auto import tqdm
 
 from ragas._analytics import track_was_completed
 from ragas.callbacks import ChainType, RagasTracer, new_group
@@ -59,12 +61,14 @@ def evaluate(
     embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
     callbacks: Callbacks = None,
     in_ci: bool = False,
-    run_config: RunConfig = RunConfig(),
+    run_config: t.Optional[RunConfig] = None,
     token_usage_parser: t.Optional[TokenUsageParser] = None,
     raise_exceptions: bool = False,
     column_map: t.Optional[t.Dict[str, str]] = None,
     show_progress: bool = True,
     batch_size: t.Optional[int] = None,
+    _run_id: t.Optional[UUID] = None,
+    _pbar: t.Optional[tqdm] = None,
 ) -> EvaluationResult:
     """
     Run the evaluation on the dataset with different metrics
@@ -146,6 +150,7 @@ def evaluate(
     """
     column_map = column_map or {}
     callbacks = callbacks or []
+    run_config = run_config or RunConfig()
 
     if helicone_config.is_enabled:
         import uuid
@@ -226,6 +231,7 @@ def evaluate(
         run_config=run_config,
         show_progress=show_progress,
         batch_size=batch_size,
+        pbar=_pbar,
     )
 
     # Ragas Callbacks
@@ -333,6 +339,7 @@ def evaluate(
             cost_cb,
         ),
         ragas_traces=tracer.traces,
+        run_id=_run_id,
     )
     if not evaluation_group_cm.ended:
         evaluation_rm.on_chain_end({"scores": result.scores})
```
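Swapping `run_config: RunConfig = RunConfig()` for an `Optional[RunConfig]` avoids Python's shared-mutable-default pitfall: a default built in the signature is constructed once at import and reused by every call. A generic sketch of the pattern (with a hypothetical `RunCfg`, not the real `RunConfig`):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RunCfg:
    """Hypothetical stand-in for ragas.run_config.RunConfig."""
    timeout: int = 60

def evaluate_old(cfg: RunCfg = RunCfg()):
    return cfg  # single instance created at definition time, shared

def evaluate_new(cfg: Optional[RunCfg] = None):
    cfg = cfg or RunCfg()  # fresh instance per call, as in the diff
    return cfg

assert evaluate_old() is evaluate_old()      # same object every call
assert evaluate_new() is not evaluate_new()  # distinct objects
```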

src/ragas/executor.py

Lines changed: 24 additions & 14 deletions
```diff
@@ -84,6 +84,7 @@ class Executor:
     batch_size: t.Optional[int] = None
     run_config: t.Optional[RunConfig] = field(default=None, repr=False)
     _nest_asyncio_applied: bool = field(default=False, repr=False)
+    pbar: t.Optional[tqdm] = None
 
     def wrap_callable_with_index(
         self, callable: t.Callable, counter: int
@@ -130,21 +131,22 @@ async def _process_jobs(self) -> t.List[t.Any]:
         results = []
 
         if not self.batch_size:
-            with tqdm(
-                total=len(self.jobs),
-                desc=self.desc,
-                disable=not self.show_progress,
-            ) as pbar:
-                # Create coroutines
-                coroutines = [
-                    afunc(*args, **kwargs) for afunc, args, kwargs, _ in self.jobs
-                ]
-                for future in await as_completed(coroutines, max_workers):
-                    result = await future
-                    results.append(result)
-                    pbar.update(1)
+            # Use external progress bar if provided, otherwise create one
+            if self.pbar is None:
+                with tqdm(
+                    total=len(self.jobs),
+                    desc=self.desc,
+                    disable=not self.show_progress,
+                ) as internal_pbar:
+                    await self._process_coroutines(
+                        self.jobs, internal_pbar, results, max_workers
+                    )
+            else:
+                await self._process_coroutines(
+                    self.jobs, self.pbar, results, max_workers
+                )
 
-            return results
+            return results
 
         # With batching, show nested progress bars
         batches = batched(self.jobs, self.batch_size)  # generator of job tuples
@@ -182,6 +184,14 @@ async def _process_jobs(self) -> t.List[t.Any]:
 
         return results
 
+    async def _process_coroutines(self, jobs, pbar, results, max_workers):
+        """Helper function to process coroutines and update the progress bar."""
+        coroutines = [afunc(*args, **kwargs) for afunc, args, kwargs, _ in jobs]
+        for future in await as_completed(coroutines, max_workers):
+            result = await future
+            results.append(result)
+            pbar.update(1)
+
     def results(self) -> t.List[t.Any]:
         """
         Execute all submitted jobs and return their results. The results are returned in the order of job submission.
```
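The `pbar` field lets a caller drive many `Executor` runs from one shared progress bar, which is what `evaluate(..., _pbar=...)` forwards so the optimizer can show a single bar across repeated evaluations. The general pattern, sketched with plain coroutines rather than the `Executor` API:

```python
import asyncio
from tqdm.auto import tqdm

async def job(i: int) -> int:
    await asyncio.sleep(0.01)
    return i * i

async def run_jobs(coros, pbar):
    # Mirror _process_coroutines: advance the caller's bar per finished job
    results = []
    for fut in asyncio.as_completed(coros):
        results.append(await fut)
        pbar.update(1)
    return results

async def main():
    with tqdm(total=6, desc="optimizing") as shared_pbar:
        # Two separate "runs" advance the same external bar
        await run_jobs([job(i) for i in range(3)], shared_pbar)
        await run_jobs([job(i) for i in range(3, 6)], shared_pbar)

asyncio.run(main())
```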

src/ragas/losses.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -1,6 +1,9 @@
 import typing as t
 from abc import ABC, abstractmethod
 
+from pydantic import GetCoreSchemaHandler
+from pydantic_core import CoreSchema, core_schema
+
 
 class Loss(ABC):
     """
@@ -11,6 +14,17 @@ class Loss(ABC):
     def __call__(self, predicted: t.List, actual: t.List) -> float:
         raise NotImplementedError
 
+    @classmethod
+    def __get_pydantic_core_schema__(
+        cls, source_type: t.Any, handler: GetCoreSchemaHandler
+    ) -> CoreSchema:
+        """
+        Define how Pydantic generates a schema for BaseRagasEmbeddings.
+        """
+        return core_schema.no_info_after_validator_function(
+            cls, core_schema.is_instance_schema(cls)  # The validator function
+        )
+
 
 class MSELoss(Loss):
     """
```

src/ragas/metrics/base.py

Lines changed: 71 additions & 5 deletions
```diff
@@ -12,8 +12,9 @@
 
 from ragas._analytics import EvaluationEvent, _analytics_batcher
 from ragas.callbacks import ChainType, new_group
-from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
+from ragas.dataset_schema import MetricAnnotation, MultiTurnSample, SingleTurnSample
 from ragas.executor import is_event_loop_running
+from ragas.losses import BinaryMetricLoss, MSELoss
 from ragas.prompt import PromptMixin
 from ragas.run_config import RunConfig
 from ragas.utils import (
@@ -232,12 +233,77 @@ def init(self, run_config: RunConfig):
     def train(
         self,
         path: str,
-        demonstration_config: DemonstrationConfig,
-        instruction_config: InstructionConfig,
-        callbacks: Callbacks,
+        demonstration_config: t.Optional[DemonstrationConfig] = None,
+        instruction_config: t.Optional[InstructionConfig] = None,
+        callbacks: t.Optional[Callbacks] = None,
+        run_config: t.Optional[RunConfig] = None,
+        batch_size: t.Optional[int] = None,
+        with_debugging_logs=False,
+        raise_exceptions: bool = True,
     ) -> None:
 
-        raise NotImplementedError("Training is not implemented for this metric.")
+        if not path.endswith(".json"):
+            raise ValueError("Train data must be in json format")
+
+        if instruction_config is None:
+            from ragas.config import InstructionConfig
+
+            instruction_config = InstructionConfig()
+
+        if demonstration_config is None:
+            from ragas.config import DemonstrationConfig
+
+            demonstration_config = DemonstrationConfig()
+
+        dataset = MetricAnnotation.from_json(path, metric_name=self.name)
+
+        optimizer = instruction_config.optimizer
+        llm = instruction_config.llm or self.llm
+        if llm is None:
+            raise ValueError(
+                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run."  # noqa
+            )
+        if optimizer.llm is None:
+            optimizer.llm = llm
+
+        if instruction_config.loss is None:
+            if self.output_type is None:
+                raise ValueError(
+                    f"Output type for metric '{self.name}' is not defined. Please set the output type in the metric or in the instruction config."
+                )
+
+            if self.output_type.name == MetricOutputType.BINARY.name:
+                loss_fun = BinaryMetricLoss()
+            elif (
+                self.output_type.name == MetricOutputType.CONTINUOUS.name
+                or self.output_type.name == MetricOutputType.DISCRETE.name
+            ):
+                loss_fun = MSELoss()
+            else:
+                raise NotImplementedError(
+                    f"Output type '{self.output_type.name}' not implemented"
+                )
+        else:
+            loss_fun = instruction_config.loss
+
+        optimizer.metric = self
+
+        optimizer_config = instruction_config.optimizer_config or {}
+        optimized_prompts = optimizer.optimize(
+            dataset[self.name],
+            loss_fun,
+            optimizer_config,
+            callbacks=callbacks,
+            run_config=run_config,
+            batch_size=batch_size,
+            with_debugging_logs=with_debugging_logs,
+            raise_exceptions=raise_exceptions,
+        )
+        prompts = self.get_prompts()
+        for key, val in optimized_prompts.items():
+            prompts[key].instruction = val
+        self.set_prompts(**prompts)
+        return
 
 
 @dataclass
```
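Put together, this is what makes the one-liner from the commit message work: the JSON annotations are loaded via `MetricAnnotation.from_json`, a loss is inferred from the metric's output type (binary → `BinaryMetricLoss`, continuous/discrete → `MSELoss`), the default `GeneticOptimizer` rewrites the prompt instructions, and the result is written back with `set_prompts`. A usage sketch, assuming an already configured LLM wrapper named `llm_4o` (as in the commit message):

```python
from ragas.metrics import AspectCritic

metric = AspectCritic(
    name="answer_correctness",
    definition=(
        "Given the user_input, reference and response. "
        "Is the response correct compared with the reference"
    ),
    llm=llm_4o,  # assumed: a BaseRagasLLM wrapper defined elsewhere
)

# Defaults apply: InstructionConfig() with GeneticOptimizer(),
# loss inferred from the metric's binary output type.
metric.train("alignment_sample.json")

# The optimized instructions now live on the metric's prompts
for name, prompt in metric.get_prompts().items():
    print(name, prompt.instruction)
```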

src/ragas/optimizers/__init__.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -1,3 +1,7 @@
-from .base import Optimizer
+from ragas.optimizers.base import Optimizer
+from ragas.optimizers.genetic import GeneticOptimizer
 
-__all__ = ["Optimizer"]
+__all__ = [
+    "Optimizer",
+    "GeneticOptimizer",
+]
```

src/ragas/optimizers/base.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,4 +49,4 @@ def optimize(
         Dict[str, str]
             The optimized prompts for given chain.
         """
-        pass
+        raise NotImplementedError("The method `optimize` must be implemented.")
```
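Raising `NotImplementedError` instead of `pass` (which silently returned `None`) makes an optimizer subclass that forgets to override `optimize` fail loudly at call time. A toy sketch of the contract (hypothetical classes, simplified signature):

```python
import typing as t

class ToyOptimizer:
    """Hypothetical mirror of the Optimizer base-class contract."""
    def optimize(self, dataset, loss, config) -> t.Dict[str, str]:
        raise NotImplementedError("The method `optimize` must be implemented.")

class NoopOptimizer(ToyOptimizer):
    def optimize(self, dataset, loss, config):
        # A real subclass (e.g. GeneticOptimizer) searches for better
        # prompt instructions here; this one returns a fixed mapping.
        return {"prompt_name": "original instruction"}

NoopOptimizer().optimize(None, None, {})   # -> dict of prompt name -> text
# ToyOptimizer().optimize(None, None, {})  # raises NotImplementedError
```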
