explodinggradients
diff --git a/‎src/ragas/integrations/__init__.py‎ b/‎src/ragas/integrations/__init__.py‎
diff --git a/‎src/ragas/integrations/langchain.py‎
Lines changed: 216 additions & 0 deletions b/‎src/ragas/integrations/langchain.py‎
Lines changed: 216 additions & 0 deletions
diff --git a/‎src/ragas/integrations/langsmith.py‎
Lines changed: 180 additions & 0 deletions b/‎src/ragas/integrations/langsmith.py‎
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,216 @@
+from __future__ import annotations
+
+import typing as t
+
+from langchain.chains.base import Chain
+from langchain.schema import RUN_KEY
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langsmith.evaluation import EvaluationResult, RunEvaluator
+from langsmith.schemas import Example, Run
+
+from ragas.embeddings import LangchainEmbeddingsWrapper
+from ragas.llms import LangchainLLMWrapper
+from ragas.metrics.base import (
+    EvaluationMode,
+    Metric,
+    MetricWithEmbeddings,
+    MetricWithLLM,
+    get_required_columns,
+)
+from ragas.run_config import RunConfig
+from ragas.validation import EVALMODE_TO_COLUMNS
+
+if t.TYPE_CHECKING:
+    from langchain.callbacks.manager import (
+        AsyncCallbackManagerForChainRun,
+        CallbackManagerForChainRun,
+    )
+
+
+class EvaluatorChain(Chain, RunEvaluator):
+    """
+    Wrapper around ragas Metrics to use them with langsmith.
+
+    The chain's input keys are whatever ``get_required_columns`` returns for
+    the wrapped metric's evaluation mode, and its only output key is the
+    metric's name. The instance can be invoked like any langchain ``Chain``
+    or handed to langsmith as a ``RunEvaluator`` (see ``evaluate_run``).
+    """
+
+    metric: Metric
+
+    def __init__(self, metric: Metric, **kwargs: t.Any):
+        """
+        Parameters
+        ----------
+        metric : Metric
+            The ragas metric to wrap.
+        **kwargs
+            Forwarded to the parent constructor. Three optional keys are also
+            read here: ``run_config`` (defaults to ``RunConfig()``), ``llm``
+            (defaults to ``ChatOpenAI()``, only used for ``MetricWithLLM``)
+            and ``embeddings`` (defaults to ``OpenAIEmbeddings()``, only used
+            for ``MetricWithEmbeddings``).
+        """
+        kwargs["metric"] = metric
+        super().__init__(**kwargs)
+        run_config = kwargs["run_config"] if "run_config" in kwargs else RunConfig()
+        if isinstance(self.metric, MetricWithLLM):
+            # Build the default lazily: ``kwargs.get("llm", ChatOpenAI())``
+            # instantiates ChatOpenAI even when the caller supplied an llm.
+            llm = kwargs["llm"] if "llm" in kwargs else ChatOpenAI()
+            t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
+        if isinstance(self.metric, MetricWithEmbeddings):
+            # Same lazy-default reasoning as for ``llm`` above.
+            embeddings = (
+                kwargs["embeddings"] if "embeddings" in kwargs else OpenAIEmbeddings()
+            )
+            t.cast(
+                MetricWithEmbeddings, self.metric
+            ).embeddings = LangchainEmbeddingsWrapper(embeddings)
+        self.metric.init(run_config)
+
+    @property
+    def input_keys(self) -> list[str]:
+        """Columns required by the wrapped metric's evaluation mode."""
+        return get_required_columns(self.metric.evaluation_mode)
+
+    @property
+    def output_keys(self) -> list[str]:
+        """Single-element list holding the wrapped metric's name."""
+        return [self.metric.name]
+
+    @staticmethod
+    def _metric_row(inputs: dict[str, t.Any]) -> dict[str, t.Any]:
+        """Build the row passed to the metric, defaulting absent columns."""
+        return {
+            "question": inputs.get("question", ""),
+            "answer": inputs.get("answer", ""),
+            "contexts": inputs.get("contexts", [""]),
+            "ground_truth": inputs.get("ground_truth", ""),
+        }
+
+    def _call(
+        self,
+        inputs: dict[str, t.Any],
+        run_manager: t.Optional[CallbackManagerForChainRun] = None,
+    ) -> dict[str, t.Any]:
+        """
+        Call the evaluation chain.
+
+        Validates ``inputs``, scores them with the wrapped metric and returns
+        ``{metric.name: score}``.
+        """
+        # The module only imports this name under ``t.TYPE_CHECKING``, so it
+        # has to be imported here to exist at runtime.
+        from langchain.callbacks.manager import CallbackManagerForChainRun
+
+        self._validate(inputs)
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+
+        score = self.metric.score(self._metric_row(inputs), callbacks=callbacks)
+        return {self.metric.name: score}
+
+    async def _acall(
+        self,
+        inputs: t.Dict[str, t.Any],
+        run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> t.Dict[str, t.Any]:
+        """
+        Call the evaluation chain.
+
+        Async counterpart of ``_call``; the metric is awaited via ``ascore``
+        and no callbacks are forwarded to it.
+        """
+        # Runtime import for the same reason as in ``_call``: the module-level
+        # import is guarded by ``t.TYPE_CHECKING``.
+        from langchain.callbacks.manager import AsyncCallbackManagerForChainRun
+
+        self._validate(inputs)
+        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+        # TODO: currently AsyncCallbacks are not supported in ragas
+        _run_manager.get_child()
+
+        score = await self.metric.ascore(self._metric_row(inputs), callbacks=[])
+        return {self.metric.name: score}
+
+    def _validate(
+        self,
+        input: dict[str, t.Any],
+        question_key: str = "question",
+        prediction_key: str = "answer",
+        context_key: str = "contexts",
+    ) -> None:
+        """
+        Check that ``input`` carries every column the metric requires.
+
+        Raises
+        ------
+        ValueError
+            If a column listed in ``EVALMODE_TO_COLUMNS`` for the metric's
+            evaluation mode is absent from ``input``.
+        """
+        # validate each example
+        required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
+        if "question" in required_columns and question_key not in input:
+            raise ValueError(
+                f'"{question_key}" is required in each example '
+                f"for the metric[{self.metric.name}] you have chosen."
+            )
+        if "answer" in required_columns and prediction_key not in input:
+            raise ValueError(
+                f'"{prediction_key}" is required in each prediction '
+                f"for the metric[{self.metric.name}] you have chosen."
+            )
+        if "contexts" in required_columns and context_key not in input:
+            raise ValueError(
+                f'"{context_key}" is required in each prediction for the '
+                f"metric[{self.metric.name}] you have chosen."
+            )
+        if "ground_truth" in required_columns and "ground_truth" not in input:
+            raise ValueError(
+                f'"ground_truth" is required in each prediction for the '
+                f"metric[{self.metric.name}] you have chosen."
+            )
+
+    @staticmethod
+    def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
+        """Return the keys from ``keys_to_check`` that are MISSING from ``dict_to_check``."""
+        return [k for k in keys_to_check if k not in dict_to_check]
+
+    def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
+        """
+        Check that a langsmith run/example pair can be scored.
+
+        Raises
+        ------
+        ValueError
+            If the example, its inputs or its outputs are missing, if the
+            example lacks 'question'/'ground_truth', if the run has no
+            outputs, or if the run outputs lack a required column.
+        """
+        if example is None:
+            raise ValueError(
+                "expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
+            )
+        if example.inputs is None:
+            raise ValueError(
+                "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
+            )
+        if example.outputs is None:
+            raise ValueError(
+                "expected example.outputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
+            )
+        if "question" not in example.inputs or "ground_truth" not in example.outputs:
+            raise ValueError(
+                "Expected 'question' and 'ground_truth' in example. "
+                f"Got inputs: {list(example.inputs)}, outputs: {list(example.outputs)}"
+            )
+        # Raise instead of assert so the check survives ``python -O``.
+        if run.outputs is None:
+            raise ValueError(
+                "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
+            )
+        # NOTE(review): the second argument presumably lists columns to leave
+        # out because they come from the example, not the run - confirm
+        # against ragas.metrics.base.get_required_columns.
+        output_keys = get_required_columns(
+            self.metric.evaluation_mode, ["question", "ground_truth"]
+        )
+        missing_keys = self._keys_are_present(output_keys, run.outputs)
+        if missing_keys:
+            raise ValueError(
+                f"Expected {output_keys} in run.outputs. "
+                f"Missing: {missing_keys}. Got: {list(run.outputs)}"
+            )
+
+    def evaluate_run(
+        self, run: Run, example: t.Optional[Example] = None
+    ) -> EvaluationResult:
+        """
+        Evaluate a langsmith run
+
+        Combines the run outputs with the example's question (and its
+        ground truth for the gc/ga/qcg/qga evaluation modes), runs the chain
+        and wraps the score in an ``EvaluationResult`` keyed by metric name.
+        """
+        self._validate_langsmith_eval(run, example)
+
+        # this is just to suppress the type checker error
+        # actual check and error message is in the _validate_langsmith_eval
+        assert run.outputs is not None
+        assert example is not None
+        assert example.inputs is not None
+        assert example.outputs is not None
+
+        # Work on a copy so the caller's ``run.outputs`` is not mutated.
+        chain_eval = dict(run.outputs)
+        chain_eval["question"] = example.inputs["question"]
+        if self.metric.evaluation_mode in [
+            EvaluationMode.gc,
+            EvaluationMode.ga,
+            EvaluationMode.qcg,
+            EvaluationMode.qga,
+        ]:
+            if example.outputs is None or "ground_truth" not in example.outputs:
+                raise ValueError("expected `ground_truth` in example outputs.")
+            chain_eval["ground_truth"] = example.outputs["ground_truth"]
+        eval_output = self(chain_eval, include_run_info=True)
+
+        evaluation_result = EvaluationResult(
+            key=self.metric.name, score=eval_output[self.metric.name]
+        )
+        if RUN_KEY in eval_output:
+            evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
+        return evaluation_result
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+import typing as t
+
+from langchain.smith import RunEvalConfig
+
+from ragas.integrations.langchain import EvaluatorChain
+
+if t.TYPE_CHECKING:
+    from langsmith.schemas import Dataset as LangsmithDataset
+
+    from ragas.testset.generator import TestDataset
+
+try:
+    from langsmith import Client
+    from langsmith.utils import LangSmithNotFoundError
+except ImportError:
+    raise ImportError(
+        "Please install langsmith to use this feature. You can install it via pip install langsmith"
+    )
+
+
+def upload_dataset(
+    dataset: TestDataset, dataset_name: str, dataset_desc: str = ""
+) -> LangsmithDataset:
+    """
+    Push a ragas ``TestDataset`` to LangSmith under a name that is not yet taken.
+
+    The dataset is converted with ``to_pandas()`` and uploaded through
+    ``Client.upload_dataframe`` with ``question`` as the input key and
+    ``ground_truth`` as the output key.
+
+    Parameters
+    ----------
+    dataset : TestDataset
+        The dataset to upload.
+    dataset_name : str
+        Name to give the dataset in LangSmith.
+    dataset_desc : str, optional
+        Description stored with the dataset. Defaults to an empty string.
+
+    Returns
+    -------
+    LangsmithDataset
+        The object returned by ``Client.upload_dataframe``.
+
+    Raises
+    ------
+    ValueError
+        If ``Client.read_dataset`` finds a dataset called ``dataset_name``.
+
+    Notes
+    -----
+    Existence is probed by reading the dataset by name; only a
+    ``LangSmithNotFoundError`` from that read lets the upload proceed.
+    A short confirmation with the dataset name and URL is printed afterwards.
+    """
+    langsmith_client = Client()
+    try:
+        # a successful read means the name is already in use
+        existing = langsmith_client.read_dataset(dataset_name=dataset_name)
+    except LangSmithNotFoundError:
+        # name is free: create the dataset from the generated examples
+        uploaded = langsmith_client.upload_dataframe(
+            df=dataset.to_pandas(),
+            name=dataset_name,
+            input_keys=["question"],
+            output_keys=["ground_truth"],
+            description=dataset_desc,
+        )
+        print(
+            f"Created a new dataset '{uploaded.name}'. Dataset is accessible at {uploaded.url}"
+        )
+        return uploaded
+
+    raise ValueError(
+        f"Dataset {dataset_name} already exists in langsmith. [{existing}]"
+    )
+
+
+def evaluate(
+    dataset_name: str,
+    llm_or_chain_factory: t.Any,
+    experiment_name: t.Optional[str] = None,
+    metrics: t.Optional[list] = None,
+    verbose: bool = False,
+) -> t.Dict[str, t.Any]:
+    """
+    Run ragas metrics over a LangSmith dataset via ``Client.run_on_dataset``.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Name of an existing LangSmith dataset to evaluate on.
+    llm_or_chain_factory : Any
+        Passed through unchanged to ``Client.run_on_dataset``.
+    experiment_name : Optional[str], optional
+        Passed to ``Client.run_on_dataset`` as ``project_name``.
+        The default is None.
+    metrics : Optional[list], optional
+        Ragas metrics to apply; each one is wrapped in an ``EvaluatorChain``.
+        When None, answer relevancy, context precision, faithfulness and
+        context recall are used. The default is None.
+    verbose : bool, optional
+        Passed through to ``Client.run_on_dataset``. The default is False.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Whatever ``Client.run_on_dataset`` returns.
+
+    Raises
+    ------
+    ValueError
+        If reading ``dataset_name`` raises ``LangSmithNotFoundError``.
+
+    Examples
+    --------
+    >>> results = evaluate(
+    ...     dataset_name="MyDataset",
+    ...     llm_or_chain_factory=my_llm,
+    ...     experiment_name="experiment_1_with_vanila_rag",
+    ...     verbose=True
+    ... )
+    >>> print(results)
+    {'evaluation_result': ...}
+    """
+    langsmith_client = Client()
+
+    # fail early when the dataset is unknown to langsmith
+    try:
+        langsmith_client.read_dataset(dataset_name=dataset_name)
+    except LangSmithNotFoundError:
+        raise ValueError(
+            f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith"
+        )
+
+    if metrics is None:
+        # imported lazily, only when the default metric set is needed
+        from ragas.metrics import (
+            answer_relevancy,
+            context_precision,
+            context_recall,
+            faithfulness,
+        )
+
+        metrics = [answer_relevancy, context_precision, faithfulness, context_recall]
+
+    # wrap every metric so langsmith can use it as a custom evaluator
+    evaluators = [EvaluatorChain(metric) for metric in metrics]
+    config = RunEvalConfig(custom_evaluators=evaluators)
+
+    return langsmith_client.run_on_dataset(
+        dataset_name=dataset_name,
+        llm_or_chain_factory=llm_or_chain_factory,
+        evaluation=config,
+        verbose=verbose,
+        # Any experiment metadata can be specified here
+        project_name=experiment_name,
+    )