
Commit 6d38222

jjmachan and mspronesti authored
feat(integrations): add support for langchain and langsmith (#795)
Co-authored-by: Massimiliano Pronesti <[email protected]>
1 parent 962d40d commit 6d38222

File tree: 5 files changed, +422 -3 lines changed


src/ragas/integrations/__init__.py

Whitespace-only changes.
src/ragas/integrations/langchain.py

Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
from __future__ import annotations

import typing as t

from langchain.chains.base import Chain
from langchain.schema import RUN_KEY
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import (
    EvaluationMode,
    Metric,
    MetricWithEmbeddings,
    MetricWithLLM,
    get_required_columns,
)
from ragas.run_config import RunConfig
from ragas.validation import EVALMODE_TO_COLUMNS

if t.TYPE_CHECKING:
    from langchain.callbacks.manager import (
        AsyncCallbackManagerForChainRun,
        CallbackManagerForChainRun,
    )


class EvaluatorChain(Chain, RunEvaluator):
    """
    Wrapper around ragas Metrics to use them with langsmith.
    """

    metric: Metric

    def __init__(self, metric: Metric, **kwargs: t.Any):
        kwargs["metric"] = metric
        super().__init__(**kwargs)
        if "run_config" in kwargs:
            run_config = kwargs["run_config"]
        else:
            run_config = RunConfig()
        if isinstance(self.metric, MetricWithLLM):
            llm = kwargs.get("llm", ChatOpenAI())
            t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm)
        if isinstance(self.metric, MetricWithEmbeddings):
            embeddings = kwargs.get("embeddings", OpenAIEmbeddings())
            t.cast(
                MetricWithEmbeddings, self.metric
            ).embeddings = LangchainEmbeddingsWrapper(embeddings)
        self.metric.init(run_config)

    @property
    def input_keys(self) -> list[str]:
        return get_required_columns(self.metric.evaluation_mode)

    @property
    def output_keys(self) -> list[str]:
        return [self.metric.name]

    def _call(
        self,
        inputs: dict[str, t.Any],
        run_manager: t.Optional[CallbackManagerForChainRun] = None,
    ) -> dict[str, t.Any]:
        """
        Call the evaluation chain.
        """
        self._validate(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()

        c = inputs.get("contexts", [""])
        g = inputs.get("ground_truth", "")
        q = inputs.get("question", "")
        a = inputs.get("answer", "")
        score = self.metric.score(
            {
                "question": q,
                "answer": a,
                "contexts": c,
                "ground_truth": g,
            },
            callbacks=callbacks,
        )
        return {self.metric.name: score}

    async def _acall(
        self,
        inputs: t.Dict[str, t.Any],
        run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> t.Dict[str, t.Any]:
        """
        Call the evaluation chain asynchronously.
        """
        self._validate(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        # TODO: currently AsyncCallbacks are not supported in ragas
        _run_manager.get_child()

        c = inputs.get("contexts", [""])
        g = inputs.get("ground_truth", "")
        q = inputs.get("question", "")
        a = inputs.get("answer", "")
        score = await self.metric.ascore(
            {
                "question": q,
                "answer": a,
                "contexts": c,
                "ground_truth": g,
            },
            callbacks=[],
        )
        return {self.metric.name: score}

    def _validate(
        self,
        input: dict[str, t.Any],
        question_key: str = "question",
        prediction_key: str = "answer",
        context_key: str = "contexts",
    ) -> None:
        # validate each example
        required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode]
        if "question" in required_columns and question_key not in input:
            raise ValueError(
                f'"{question_key}" is required in each example '
                f"for the metric[{self.metric.name}] you have chosen."
            )
        if "answer" in required_columns and prediction_key not in input:
            raise ValueError(
                f'"{prediction_key}" is required in each prediction '
                f"for the metric[{self.metric.name}] you have chosen."
            )
        if "contexts" in required_columns and context_key not in input:
            raise ValueError(
                f'"{context_key}" is required in each prediction for the '
                f"metric[{self.metric.name}] you have chosen."
            )
        if "ground_truth" in required_columns and "ground_truth" not in input:
            raise ValueError(
                f'"ground_truth" is required in each prediction for the '
                f"metric[{self.metric.name}] you have chosen."
            )

    @staticmethod
    def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]:
        return [k for k in keys_to_check if k not in dict_to_check]

    def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None:
        if example is None:
            raise ValueError(
                "expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
            )
        if example.inputs is None:
            raise ValueError(
                "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
            )
        if example.outputs is None:
            raise ValueError(
                "expected example.outputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded."
            )
        if "question" not in example.inputs or "ground_truth" not in example.outputs:
            raise ValueError(
                "Expected 'question' and 'ground_truth' in example. "
                f"Got: {[k for k in example.inputs.keys()]}"
            )
        assert (
            run.outputs is not None
        ), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys."
        output_keys = get_required_columns(
            self.metric.evaluation_mode, ["question", "ground_truth"]
        )
        missing_keys = self._keys_are_present(output_keys, run.outputs)
        if missing_keys:
            raise ValueError(
                "Expected 'answer' and 'contexts' in run.outputs. "
                f"Got: {[k for k in run.outputs.keys()]}"
            )

    def evaluate_run(
        self, run: Run, example: t.Optional[Example] = None
    ) -> EvaluationResult:
        """
        Evaluate a langsmith run.
        """
        self._validate_langsmith_eval(run, example)

        # this is just to suppress the type checker error
        # actual check and error message is in the _validate_langsmith_eval
        assert run.outputs is not None
        assert example is not None
        assert example.inputs is not None
        assert example.outputs is not None

        chain_eval = run.outputs
        chain_eval["question"] = example.inputs["question"]
        if self.metric.evaluation_mode in [
            EvaluationMode.gc,
            EvaluationMode.ga,
            EvaluationMode.qcg,
            EvaluationMode.qga,
        ]:
            if example.outputs is None or "ground_truth" not in example.outputs:
                raise ValueError("expected `ground_truth` in example outputs.")
            chain_eval["ground_truth"] = example.outputs["ground_truth"]
        eval_output = self(chain_eval, include_run_info=True)

        evaluation_result = EvaluationResult(
            key=self.metric.name, score=eval_output[self.metric.name]
        )
        if RUN_KEY in eval_output:
            evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY]
        return evaluation_result
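
As a quick orientation (not part of the commit), here is a minimal sketch of how the EvaluatorChain above can be used on its own. It assumes the ragas faithfulness metric and an OpenAI API key in the environment, since the constructor falls back to ChatOpenAI and OpenAIEmbeddings when no llm or embeddings are supplied; the sample question, answer, and contexts are made up for illustration.

from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

# Wrap a ragas metric so it behaves like a langchain Chain / langsmith RunEvaluator.
faithfulness_chain = EvaluatorChain(metric=faithfulness)

# The chain takes the usual ragas columns: question, answer, contexts (a list of
# strings) and, for metrics that need it, ground_truth. The output is keyed by
# the metric's name.
result = faithfulness_chain(
    {
        "question": "What is the capital of France?",  # illustrative example
        "answer": "Paris is the capital of France.",
        "contexts": ["Paris is the capital and largest city of France."],
        "ground_truth": "Paris",
    }
)
print(result["faithfulness"])

Because the class also implements RunEvaluator.evaluate_run, the same objects can be handed to LangSmith as custom evaluators, which is how the langsmith module below uses them.
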
src/ragas/integrations/langsmith.py

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
from __future__ import annotations

import typing as t

from langchain.smith import RunEvalConfig

from ragas.integrations.langchain import EvaluatorChain

if t.TYPE_CHECKING:
    from langsmith.schemas import Dataset as LangsmithDataset

    from ragas.testset.generator import TestDataset

try:
    from langsmith import Client
    from langsmith.utils import LangSmithNotFoundError
except ImportError:
    raise ImportError(
        "Please install langsmith to use this feature. You can install it via pip install langsmith"
    )


def upload_dataset(
    dataset: TestDataset, dataset_name: str, dataset_desc: str = ""
) -> LangsmithDataset:
    """
    Uploads a new dataset to LangSmith, converting it from a TestDataset object to a
    pandas DataFrame before upload. If a dataset with the specified name already
    exists, the function raises an error.

    Parameters
    ----------
    dataset : TestDataset
        The dataset to be uploaded.
    dataset_name : str
        The name for the new dataset in LangSmith.
    dataset_desc : str, optional
        A description for the new dataset. The default is an empty string.

    Returns
    -------
    LangsmithDataset
        The dataset object as stored in LangSmith after upload.

    Raises
    ------
    ValueError
        If a dataset with the specified name already exists in LangSmith.

    Notes
    -----
    The function attempts to read a dataset by the given name to check its existence.
    If not found, it proceeds to upload the dataset after converting it to a pandas
    DataFrame. This involves specifying input and output keys for the dataset being
    uploaded.
    """
    client = Client()
    try:
        # check if dataset exists
        dataset = client.read_dataset(dataset_name=dataset_name)
        raise ValueError(
            f"Dataset {dataset_name} already exists in langsmith. [{dataset}]"
        )
    except LangSmithNotFoundError:
        # if not, create a new one with the generated query examples
        dataset = client.upload_dataframe(
            df=dataset.to_pandas(),
            name=dataset_name,
            input_keys=["question"],
            output_keys=["ground_truth"],
            description=dataset_desc,
        )

        print(
            f"Created a new dataset '{dataset.name}'. Dataset is accessible at {dataset.url}"
        )
        return dataset


def evaluate(
    dataset_name: str,
    llm_or_chain_factory: t.Any,
    experiment_name: t.Optional[str] = None,
    metrics: t.Optional[list] = None,
    verbose: bool = False,
) -> t.Dict[str, t.Any]:
    """
    Evaluates a language model or a chain factory on a specified dataset using
    LangSmith, with the option to customize metrics and verbosity.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset to use for evaluation. This dataset must exist in
        LangSmith.
    llm_or_chain_factory : Any
        The language model or chain factory to be evaluated. This parameter is
        flexible and can accept a variety of objects depending on the implementation.
    experiment_name : Optional[str], optional
        The name of the experiment. This can be used to categorize or identify the
        evaluation run within LangSmith. The default is None.
    metrics : Optional[list], optional
        A list of custom metrics (functions or evaluators) to be used for the
        evaluation. If None, a default set of metrics (answer relevancy, context
        precision, context recall, and faithfulness) is used.
        The default is None.
    verbose : bool, optional
        If True, detailed progress and results will be printed during the evaluation
        process.
        The default is False.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing the results of the evaluation.

    Raises
    ------
    ValueError
        If the specified dataset does not exist in LangSmith.

    See Also
    --------
    Client.read_dataset : Method to read an existing dataset.
    Client.run_on_dataset : Method to run the evaluation on the specified dataset.

    Examples
    --------
    >>> results = evaluate(
    ...     dataset_name="MyDataset",
    ...     llm_or_chain_factory=my_llm,
    ...     experiment_name="experiment_1_with_vanilla_rag",
    ...     verbose=True
    ... )
    >>> print(results)
    {'evaluation_result': ...}

    Notes
    -----
    The function initializes a client to interact with LangSmith, validates the existence
    of the specified dataset, prepares evaluation metrics, and runs the evaluation,
    returning the results. Custom evaluation metrics can be specified, or a default set
    will be used if none are provided.
    """
    # init client and validate dataset
    client = Client()
    try:
        _ = client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError:
        raise ValueError(
            f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith"
        )

    # make config
    if metrics is None:
        from ragas.metrics import (
            answer_relevancy,
            context_precision,
            context_recall,
            faithfulness,
        )

        metrics = [answer_relevancy, context_precision, faithfulness, context_recall]

    metrics = [EvaluatorChain(m) for m in metrics]
    eval_config = RunEvalConfig(
        custom_evaluators=metrics,
    )

    # run evaluation with langsmith
    run = client.run_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=llm_or_chain_factory,
        evaluation=eval_config,
        verbose=verbose,
        # Any experiment metadata can be specified here
        project_name=experiment_name,
    )

    return run
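
To tie the two helpers together, here is a hedged end-to-end sketch (not part of the commit). `testset` stands in for a ragas TestDataset generated elsewhere, and `make_rag_chain` is a hypothetical chain factory whose chains take a "question" input and return the "answer" and "contexts" keys the metrics expect.

from ragas.integrations.langsmith import upload_dataset, evaluate
from ragas.metrics import answer_relevancy, faithfulness

# `testset` is assumed to be a ragas TestDataset; `make_rag_chain` is a
# hypothetical factory returning a chain that outputs "answer" and "contexts".
dataset = upload_dataset(testset, "eval-dataset-v1", "synthetic ragas test set")

results = evaluate(
    dataset_name="eval-dataset-v1",
    llm_or_chain_factory=make_rag_chain,
    experiment_name="baseline_rag",
    metrics=[faithfulness, answer_relevancy],
    verbose=True,
)

If metrics is omitted, the default set (answer relevancy, context precision, faithfulness, context recall) is wrapped in EvaluatorChain instances automatically.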
