38 changes: 28 additions & 10 deletions docs/howtos/integrations/_ag_ui.md
@@ -26,7 +26,7 @@ import asyncio
from dotenv import load_dotenv
import nest_asyncio
from IPython.display import display
from langchain_openai import ChatOpenAI
from openai import AsyncOpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample, MultiTurnSample
from ragas.integrations.ag_ui import (
@@ -35,8 +35,14 @@ from ragas.integrations.ag_ui import (
convert_messages_snapshot,
)
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)
from ragas.llms import llm_factory
from ag_ui.core import (
MessagesSnapshotEvent,
TextMessageChunkEvent,
@@ -109,21 +115,24 @@ weather_queries


## Configure metrics and the evaluator LLM
Wrap your grading model with the appropriate adapter and instantiate the metrics you plan to use.
Create an Instructor-compatible grading model with `llm_factory` and instantiate the metrics you plan to use.



```python
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
client = AsyncOpenAI()
evaluator_llm = llm_factory("gpt-4o-mini", client=client)

qa_metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
tool_metrics = [ToolCallF1()] # rule-based, no LLM required

```

/var/folders/8k/tf3xr1rd1fl_dz35dfhfp_tc0000gn/T/ipykernel_93918/2135722072.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))


## Evaluate a live AG-UI endpoint
Set the endpoint URL exposed by your agent. Toggle the flags when you are ready to run the evaluations.
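
A minimal sketch of that configuration, assuming the endpoint path used elsewhere in these docs; `RUN_FACTUAL_EVAL` appears in the evaluation cells, while the URL variable and `RUN_TOOL_EVAL` names are illustrative:

```python
# Assumed names; adjust to match your own agent deployment.
AG_UI_ENDPOINT = "http://localhost:8000/agentic_chat"

RUN_FACTUAL_EVAL = False  # set to True to run the QA metrics against the live agent
RUN_TOOL_EVAL = False     # set to True to run the tool-call evaluation
```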
@@ -189,6 +198,9 @@ if RUN_FACTUAL_EVAL:
<th>response</th>
<th>reference</th>
<th>factual_correctness(mode=f1)</th>
<th>context_precision_with_reference</th>
<th>context_recall</th>
<th>response_groundedness</th>
</tr>
</thead>
<tbody>
@@ -199,6 +211,9 @@
<td>The theory of relativity was originated by Alb...</td>
<td>Albert Einstein originated the theory of relat...</td>
<td>0.33</td>
<td>0.50</td>
<td>0.75</td>
<td>0.80</td>
</tr>
<tr>
<th>1</th>
@@ -207,6 +222,9 @@
<td>Penicillin was discovered by Alexander Fleming...</td>
<td>Alexander Fleming discovered penicillin in 1928.</td>
<td>1.00</td>
<td>0.75</td>
<td>1.00</td>
<td>0.95</td>
</tr>
</tbody>
</table>
24 changes: 18 additions & 6 deletions docs/howtos/integrations/ag_ui.md
@@ -78,16 +78,28 @@ weather_queries = EvaluationDataset(

## Choose metrics and evaluator model

The integration works with any Ragas metric. For most text-based evaluations you will want a grading LLM. Wrap your model with the appropriate adapter (LangChain shown here, but llama-index and LiteLLM wrappers work as well).
The integration works with any Ragas metric. To use the metrics in `ragas.metrics.collections`, build an Instructor-compatible grading LLM with `llm_factory`.

```python
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
client = AsyncOpenAI()
evaluator_llm = llm_factory("gpt-4o-mini", client=client)

qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
qa_metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
tool_metrics = [ToolCallF1()] # rule-based metric, no LLM required
```
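
`ToolCallF1` compares the tool calls the agent actually makes against the `reference_tool_calls` recorded on each `MultiTurnSample`. A minimal sketch of such a dataset, assuming a weather tool named `get_weather` that takes a `location` argument (both names are illustrative):

```python
from ragas.dataset_schema import EvaluationDataset, MultiTurnSample
from ragas.messages import HumanMessage, ToolCall

# Each sample pairs the user's request with the tool call the agent is expected
# to make; ToolCallF1 then scores the agent's actual calls against these references.
weather_queries = EvaluationDataset(
    samples=[
        MultiTurnSample(
            user_input=[HumanMessage(content="What's the weather in Paris right now?")],
            reference_tool_calls=[ToolCall(name="get_weather", args={"location": "Paris"})],
        )
    ]
)
```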

32 changes: 18 additions & 14 deletions examples/ragas_examples/ag_ui_agent_evals/README.md
@@ -36,18 +36,18 @@ Install the required dependencies:
uv pip install -e ".[dev]"

# Or install specific dependencies
pip install ragas langchain-openai
pip install ragas openai
```

## Evaluation Scenarios

This example includes two evaluation scenarios:

### 1. Scientist Biographies (Factual Correctness)
### 1. Scientist Biographies (Factuality & Grounding)

Tests the agent's ability to provide factually correct information about famous scientists.
Tests the agent's ability to provide factually correct information about famous scientists and ground its answers in retrieved evidence.

- **Metric**: `FactualCorrectness` - Measures how accurate the agent's responses are compared to reference answers
- **Metrics**: Collections metrics — `FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, `ResponseGroundedness`
- **Dataset**: `test_data/scientist_biographies.csv` - 5 questions about scientists (Einstein, Fleming, Newton, etc.)
- **Sample Type**: `SingleTurnSample` - Simple question-answer pairs (see the loading sketch below)
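
A minimal loading sketch, assuming the CSV uses `user_input` and `reference` columns (the actual loader and column names live in `evals.py`):

```python
import csv

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

# Assumed columns: user_input, reference. The agent's responses are collected
# later, when the evaluation runs against the live AG-UI endpoint.
with open("test_data/scientist_biographies.csv", newline="") as f:
    samples = [
        SingleTurnSample(user_input=row["user_input"], reference=row["reference"])
        for row in csv.DictReader(f)
    ]

scientist_dataset = EvaluationDataset(samples=samples)
```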

@@ -120,13 +120,16 @@ Evaluating against endpoint: http://localhost:8000/agentic_chat
================================================================================
Scientist Biographies Evaluation Results
================================================================================
user_input ... factual_correctness(mode=f1)
0 Who originated the theory of relativity... ... 0.75
1 Who discovered penicillin and when... ... 1.00
user_input ... response_groundedness
0 Who originated the theory of relativity... ... 0.83
1 Who discovered penicillin and when... ... 1.00
...

Average Factual Correctness: 0.7160
Perfect scores (1.0): 2/5
Average Context Precision: 0.6500
Average Context Recall: 0.7200
Average Response Groundedness: 0.7800
Perfect factual scores (1.0): 2/5

Results saved to: .../scientist_biographies_results_20250101_143022.csv

@@ -155,8 +158,8 @@ Results are saved as timestamped CSV files:
Example CSV structure:

```csv
user_input,response,reference,factual_correctness(mode=f1)
"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75
user_input,response,reference,factual_correctness(mode=f1),context_precision_with_reference,context_recall,response_groundedness
"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75,0.50,0.75,0.83
```
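
A short follow-up sketch, assuming pandas is installed, for aggregating a saved results file; the filename below is illustrative:

```python
import pandas as pd

# Illustrative filename; the script writes a timestamped file as shown above.
df = pd.read_csv("scientist_biographies_results_20250101_143022.csv")

metric_cols = [
    "factual_correctness(mode=f1)",
    "context_precision_with_reference",
    "context_recall",
    "response_groundedness",
]
print(df[metric_cols].mean().round(4))
print("Perfect factual scores:", (df["factual_correctness(mode=f1)"] == 1.0).sum(), "of", len(df))
```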

## Customizing the Evaluation
@@ -183,15 +186,16 @@ user_input,reference_tool_calls

### Using Different Metrics

Modify `evals.py` to include additional Ragas metrics:
Modify `evals.py` to include additional collections metrics:

```python
from ragas.metrics import AnswerRelevancy, ContextPrecision
from ragas.metrics.collections import AnswerRelevancy, ContextPrecisionWithoutReference

# In evaluate_scientist_biographies function:
metrics = [
FactualCorrectness(),
AnswerRelevancy(), # Add additional metrics
AnswerRelevancy(llm=evaluator_llm),
ContextPrecisionWithoutReference(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
```

10 changes: 6 additions & 4 deletions examples/ragas_examples/ag_ui_agent_evals/__init__.py
@@ -37,11 +37,13 @@

This package includes two evaluation scenarios:

1. **Scientist Biographies** - Tests factual correctness of agent responses
using the FactualCorrectness metric with SingleTurnSample datasets.
1. **Scientist Biographies** - Uses the modern collections metrics
(`FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`,
`ResponseGroundedness`) with `SingleTurnSample` datasets to score factuality
and grounding in one pass.

2. **Weather Tool Usage** - Tests tool calling accuracy using the ToolCallF1
metric with MultiTurnSample datasets.
2. **Weather Tool Usage** - Tests tool calling accuracy using the `ToolCallF1`
metric with `MultiTurnSample` datasets.

## Results

45 changes: 32 additions & 13 deletions examples/ragas_examples/ag_ui_agent_evals/evals.py
@@ -26,17 +26,23 @@
from pathlib import Path
from typing import List

from langchain_openai import ChatOpenAI

from openai import AsyncOpenAI
from ragas.dataset_schema import (
EvaluationDataset,
MultiTurnSample,
SingleTurnSample,
)
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.llms import LangchainLLMWrapper
from ragas.llms import llm_factory
from ragas.llms.base import InstructorBaseRagasLLM
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)

# Configure logging
logging.basicConfig(
@@ -104,7 +110,7 @@ def load_weather_dataset() -> EvaluationDataset:


async def evaluate_scientist_biographies(
endpoint_url: str, evaluator_llm: LangchainLLMWrapper
endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to provide factually correct information
@@ -125,8 +131,13 @@ async def evaluate_scientist_biographies(
# Load dataset
dataset = load_scientist_dataset()

# Define metrics
metrics = [FactualCorrectness()]
# Define metrics using the modern collections portfolio
metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]

# Run evaluation
logger.info(f"Evaluating against endpoint: {endpoint_url}")
@@ -148,18 +159,26 @@ async def evaluate_scientist_biographies(
logger.info(f"\nDataFrame shape: {df.shape}")
logger.info(f"\n{df.to_string()}")

metric_columns = [
"factual_correctness(mode=f1)",
"context_precision_with_reference",
"context_recall",
"response_groundedness",
]
for column in metric_columns:
if column in df.columns:
logger.info(f"Average {column}: {df[column].mean():.4f}")

if "factual_correctness(mode=f1)" in df.columns:
avg_correctness = df["factual_correctness(mode=f1)"].mean()
logger.info(f"\nAverage Factual Correctness: {avg_correctness:.4f}")
logger.info(
f"Perfect scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
f"Perfect factual scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
)

return result, df


async def evaluate_weather_tool_use(
endpoint_url: str, evaluator_llm: LangchainLLMWrapper
endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to correctly call the weather tool.
@@ -278,8 +297,8 @@ async def main():

# Setup evaluator LLM
logger.info(f"Setting up evaluator LLM: {args.evaluator_model}")
llm = ChatOpenAI(model=args.evaluator_model)
evaluator_llm = LangchainLLMWrapper(llm)
client = AsyncOpenAI()
evaluator_llm = llm_factory(args.evaluator_model, client=client)

# Run evaluations
try: