
Commit c40891b

Update Opik notebook example and integration (#1295)

jverre and anmorgan24 authored

# Opik integration update

Updated the Opik integration:

1. Updated notebooks following the changes to expected inputs for metric functions
2. Updated the OpikTracer following some changes in the Opik library

---------

Co-authored-by: Abby Morgan <[email protected]>
1 parent 78b54c6 commit c40891b
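For readers skimming the diff, the "expected inputs for metric functions" change in point 1 amounts to renamed row keys that are now wrapped in a `SingleTurnSample` before scoring. A minimal sketch based on the notebook diff below (the example values are illustrative):

```python
from ragas.dataset_schema import SingleTurnSample

# Old-style row keys, previously passed straight to metric.ascore(...)
old_row = {
    "question": "What is the capital of France?",
    "answer": "Paris",
    "contexts": ["Paris is the capital of France."],
}

# New-style keys, wrapped in a SingleTurnSample and scored via single_turn_ascore(...)
sample = SingleTurnSample(
    user_input=old_row["question"],
    response=old_row["answer"],
    retrieved_contexts=old_row["contexts"],
)
```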

File tree

3 files changed: +68 -59 lines changed

- docs/howtos/applications/tracing.md
- docs/howtos/integrations/opik.ipynb
- src/ragas/integrations/opik.py


docs/howtos/applications/tracing.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ export LANGCHAIN_API_KEY=<your-api-key>
 export LANGCHAIN_PROJECT=<your-project> # if not specified, defaults to "default"
 ```
 
-Now we have to import the required tracer from langchain, here we are using `LangChainTracer` but you can similarly use any tracer supported by langchain like [WandbTracer](https://python.langchain.com/docs/integrations/providers/wandb_tracing) or [OpikTracer](https://comet.com/docs/opik/tracing/integrations/ragas?utm_source=ragas&utm_medium=github&utm_campaign=opik&utm_content=tracing_how_to)
+Now we have to import the required tracer from langchain, here we are using `LangChainTracer` but you can similarly use any tracer supported by langchain like [WandbTracer](https://python.langchain.com/docs/integrations/providers/wandb_tracing) or [OpikTracer](https://comet.com/docs/opik/tracing/integrations/ragas?utm_source=ragas&utm_medium=docs&utm_campaign=opik&utm_content=tracing_how_to)
 
 ```{code-block} python
 # langsmith
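The pattern this doc describes, shown below as a minimal sketch, is to construct a tracer and hand it to Ragas via `callbacks`. The project name, dataset contents, and default metric LLM (which needs an OpenAI key) are illustrative assumptions, and the `OpikTracer` from `ragas.integrations.opik` can be substituted in the same `callbacks` list:

```python
from datasets import Dataset
from langchain.callbacks.tracers import LangChainTracer

from ragas import evaluate
from ragas.metrics import answer_relevancy

# Tiny illustrative dataset using the current Ragas column names
dataset = Dataset.from_dict(
    {
        "user_input": ["What is the capital of France?"],
        "response": ["Paris"],
        "retrieved_contexts": [["Paris is the capital of France."]],
    }
)

# LangSmith tracer; "ragas-tracing-demo" is a placeholder project name
tracer = LangChainTracer(project_name="ragas-tracing-demo")

result = evaluate(dataset, metrics=[answer_relevancy], callbacks=[tracer])
print(result)
```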

docs/howtos/integrations/opik.ipynb

Lines changed: 48 additions & 36 deletions
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-   "# Opik by Comet\n",
+   "# Comet Opik\n",
    "\n",
    "In this notebook, we will showcase how to use Opik with Ragas for monitoring and evaluation of RAG (Retrieval-Augmented Generation) pipelines.\n",
    "\n",
@@ -13,16 +13,18 @@
    "1. Using Ragas metrics to score traces\n",
    "2. Using the Ragas `evaluate` function to score a dataset\n",
    "\n",
+   "<center><img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-project-dashboard.png\" alt=\"Comet Opik project dashboard screenshot with list of traces and spans\" width=\"600\" style=\"border: 0.5px solid #ddd;\"/></center>\n",
+   "\n",
    "## Setup\n",
    "\n",
-   "[Comet](https://www.comet.com/site?utm_medium=github&utm_source=ragas&utm_campaign=opik) provides a hosted version of the Opik platform, [simply create an account](https://www.comet.com/signup?from=llm&utm_medium=github&utm_source=ragas&utm_campaign=opik) and grab you API Key.\n",
+   "[Comet](https://www.comet.com/site?utm_medium=docs&utm_source=ragas&utm_campaign=opik) provides a hosted version of the Opik platform, [simply create an account](https://www.comet.com/signup?from=llm&utm_medium=docs&utm_source=ragas&utm_campaign=opik) and grab you API Key.\n",
    "\n",
-   "> You can also run the Opik platform locally, see the [installation guide](https://www.comet.com/docs/opik/self-host/self_hosting_opik?utm_medium=github&utm_source=ragas&utm_campaign=opik/) for more information."
+   "> You can also run the Opik platform locally, see the [installation guide](https://www.comet.com/docs/opik/self-host/self_hosting_opik?utm_medium=docs&utm_source=ragas&utm_campaign=opik/) for more information."
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -44,7 +46,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -63,7 +65,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -97,7 +99,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -126,7 +128,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -138,43 +140,48 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Answer Relevancy score: 0.9616931041269692\n"
+     "Answer Relevancy score: 1.0\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "from ragas.integrations.opik import OpikTracer\n",
+   "from ragas.dataset_schema import SingleTurnSample\n",
    "\n",
    "\n",
    "# Define the scoring function\n",
-   "def compute_metric(opik_tracer, metric, row):\n",
+   "def compute_metric(metric, row):\n",
+   "    row = SingleTurnSample(**row)\n",
+   "\n",
+   "    opik_tracer = OpikTracer()\n",
+   "\n",
    "    async def get_score(opik_tracer, metric, row):\n",
-   "        score = await metric.ascore(row, callbacks=[opik_tracer])\n",
+   "        score = await metric.single_turn_ascore(row, callbacks=[OpikTracer()])\n",
    "        return score\n",
    "\n",
    "    # Run the async function using the current event loop\n",
    "    loop = asyncio.get_event_loop()\n",
+   "\n",
    "    result = loop.run_until_complete(get_score(opik_tracer, metric, row))\n",
    "    return result\n",
    "\n",
    "\n",
    "# Score a simple example\n",
    "row = {\n",
-   "    \"question\": \"What is the capital of France?\",\n",
-   "    \"answer\": \"Paris\",\n",
-   "    \"contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
+   "    \"user_input\": \"What is the capital of France?\",\n",
+   "    \"response\": \"Paris\",\n",
+   "    \"retrieved_contexts\": [\"Paris is the capital of France.\", \"Paris is in France.\"],\n",
    "}\n",
    "\n",
-   "opik_tracer = OpikTracer()\n",
-   "score = compute_metric(opik_tracer, answer_relevancy_metric, row)\n",
+   "score = compute_metric(answer_relevancy_metric, row)\n",
    "print(\"Answer Relevancy score:\", score)"
   ]
  },
@@ -186,14 +193,14 @@
    "\n",
    "#### Score traces\n",
    "\n",
-   "You can score traces by using the `get_current_trace` function to get the current trace and then calling the `log_feedback_score` function.\n",
+   "You can score traces by using the `update_current_trace` function to get the current trace and passing the feedback scores to that function.\n",
    "\n",
    "The advantage of this approach is that the scoring span is added to the trace allowing for a more fine-grained analysis of the RAG pipeline. It will however run the Ragas metric calculation synchronously and so might not be suitable for production use-cases."
   ]
  },
  {
   "cell_type": "code",
-  "execution_count": 5,
+  "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -202,14 +209,14 @@
      "'Paris'"
     ]
    },
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "from opik import track\n",
-  "from opik.opik_context import get_current_trace\n",
+  "from opik.opik_context import update_current_trace\n",
   "\n",
   "\n",
   "@track\n",
@@ -227,7 +234,7 @@
   "@track(name=\"Compute Ragas metric score\", capture_input=False)\n",
   "def compute_rag_score(answer_relevancy_metric, question, answer, contexts):\n",
   "    # Define the score function\n",
-  "    row = {\"question\": question, \"answer\": answer, \"contexts\": contexts}\n",
+  "    row = {\"user_input\": question, \"response\": answer, \"retrieved_contexts\": contexts}\n",
   "    score = compute_metric(answer_relevancy_metric, row)\n",
   "    return score\n",
   "\n",
@@ -238,9 +245,10 @@
   "    contexts = retrieve_contexts(question)\n",
   "    answer = answer_question(question, contexts)\n",
   "\n",
-  "    trace = get_current_trace()\n",
   "    score = compute_rag_score(answer_relevancy_metric, question, answer, contexts)\n",
-  "    trace.log_feedback_score(\"answer_relevancy\", round(score, 4), category_name=\"ragas\")\n",
+  "    update_current_trace(\n",
+  "        feedback_scores=[{\"name\": \"answer_relevancy\", \"value\": round(score, 4)}]\n",
+  "    )\n",
   "\n",
   "    return answer\n",
   "\n",
@@ -261,25 +269,18 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 8,
   "metadata": {},
   "outputs": [
-   {
-    "name": "stderr",
-    "output_type": "stream",
-    "text": [
-     "passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`\n"
-    ]
-   },
   {
    "data": {
     "application/vnd.jupyter.widget-view+json": {
-     "model_id": "985d2e27ce8a48daad673666e6e6e953",
+     "model_id": "07abcf96a39b4fd183756d5dc3b617c9",
     "version_major": 2,
     "version_minor": 0
    },
    "text/plain": [
-    "Evaluating: 0%| | 0/9 [00:00<?, ?it/s]"
+    "Evaluating: 0%| | 0/6 [00:00<?, ?it/s]"
   ]
  },
  "metadata": {},
@@ -289,7 +290,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
-  "{'context_precision': 1.0000, 'faithfulness': 0.8250, 'answer_relevancy': 0.9755}\n"
+  "{'context_precision': 1.0000, 'faithfulness': 0.7375, 'answer_relevancy': 0.9889}\n"
  ]
 }
],
@@ -301,10 +302,21 @@
  "\n",
  "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
  "\n",
+ "# Reformat the dataset to match the schema expected by the Ragas evaluate function\n",
+ "dataset = fiqa_eval[\"baseline\"].select(range(3))\n",
+ "\n",
+ "dataset = dataset.map(\n",
+ "    lambda x: {\n",
+ "        \"user_input\": x[\"question\"],\n",
+ "        \"reference\": x[\"ground_truths\"][0],\n",
+ "        \"retrieved_contexts\": x[\"contexts\"],\n",
+ "    }\n",
+ ")\n",
+ "\n",
  "opik_tracer_eval = OpikTracer(tags=[\"ragas_eval\"], metadata={\"evaluation_run\": True})\n",
  "\n",
  "result = evaluate(\n",
- "    fiqa_eval[\"baseline\"].select(range(3)),\n",
+ "    dataset,\n",
  "    metrics=[context_precision, faithfulness, answer_relevancy],\n",
  "    callbacks=[opik_tracer_eval],\n",
  ")\n",

src/ragas/integrations/opik.py

Lines changed: 19 additions & 22 deletions
@@ -1,9 +1,9 @@
 import typing as t
 
 try:
-    from opik.integrations.langchain import (  # type: ignore
+    from opik.integrations.langchain import (
         OpikTracer as LangchainOpikTracer,
-    )
+    )  # type: ignore
 
     from ragas.evaluation import RAGAS_EVALUATION_CHAIN_NAME
 except ImportError:
@@ -29,37 +29,34 @@ class OpikTracer(LangchainOpikTracer):
 
     _evaluation_run_id: t.Optional[str] = None
 
-    def _persist_run(self, run: "Run"):
-        # The _persist_run function is called by LangChain if it is a root run,
-        # we update it so that we don't log the root run if we are running an evaluation.
-        if run.id != self._evaluation_run_id:
-            super()._persist_run(run)
-
-    def _on_chain_start(self, run: "Run"):
+    def _process_start_trace(self, run: "Run"):
         if (run.parent_run_id is None) and (run.name == RAGAS_EVALUATION_CHAIN_NAME):
             # Store the evaluation run id so we can flag the child traces and log them independently
-            self._evaluation_run_id = str(run.id)
+            self._evaluation_run_id = run.id
         else:
-            # Each child trace of the "ragas evaluation" chain should be a new trace
             if run.parent_run_id == self._evaluation_run_id:
                 run.parent_run_id = None
 
-        super()._on_chain_start(run)
+        super()._process_start_trace(run)
 
-    def _on_chain_end(self, run: "Run"):
-        if run.id == self._evaluation_run_id:
-            pass
-        else:
-            # We want to log the output row chain as feedback scores as these align with the Opik terminology of "feedback scores"
-            if run.name.startswith("row ") and (self._evaluation_run_id is not None):
-                span = self._span_map[run.id]
-                trace_id = span.trace_id
+    def _process_end_trace(self, run: "Run"):
+        if run.id != self._evaluation_run_id:
+            if run.name.startswith("row "):
+                trace_data = self._created_traces_data_map[run.id]
                 if run.outputs:
                     self._opik_client.log_traces_feedback_scores(
                         [
-                            {"id": trace_id, "name": name, "value": round(value, 4)}
+                            {
+                                "id": trace_data.id,
+                                "name": name,
+                                "value": round(value, 4),
+                            }
                             for name, value in run.outputs.items()
                         ]
                     )
 
-        self._persist_run(run)
+        super()._process_end_trace(run)
+
+    def _persist_run(self, run: "Run"):
+        if run.id != self._evaluation_run_id:
+            super()._persist_run(run)
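For context on how these hooks are exercised, the tracer is passed to `ragas.evaluate` as a callback exactly as in the notebook above; row-level chain outputs are then logged back to Opik as feedback scores by `_process_end_trace`. A minimal usage sketch (the dataset contents are illustrative, and an OpenAI key is assumed for the metric LLMs):

```python
from datasets import Dataset

from ragas import evaluate
from ragas.integrations.opik import OpikTracer
from ragas.metrics import answer_relevancy, context_precision, faithfulness

# Illustrative single-row dataset using the column names Ragas now expects
dataset = Dataset.from_dict(
    {
        "user_input": ["What is the capital of France?"],
        "response": ["Paris"],
        "retrieved_contexts": [["Paris is the capital of France."]],
        "reference": ["Paris"],
    }
)

# Tags and metadata are attached to every trace created during the run;
# per-row metric outputs end up as Opik feedback scores via the tracer above.
opik_tracer_eval = OpikTracer(tags=["ragas_eval"], metadata={"evaluation_run": True})

result = evaluate(
    dataset,
    metrics=[context_precision, faithfulness, answer_relevancy],
    callbacks=[opik_tracer_eval],
)
print(result)
```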
