diff --git a/docs/howtos/integrations/_ag_ui.md b/docs/howtos/integrations/_ag_ui.md
index cf9e056a5..4d3308346 100644
--- a/docs/howtos/integrations/_ag_ui.md
+++ b/docs/howtos/integrations/_ag_ui.md
@@ -26,7 +26,7 @@ import asyncio
from dotenv import load_dotenv
import nest_asyncio
from IPython.display import display
-from langchain_openai import ChatOpenAI
+from openai import AsyncOpenAI
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample, MultiTurnSample
from ragas.integrations.ag_ui import (
@@ -35,8 +35,14 @@ from ragas.integrations.ag_ui import (
convert_messages_snapshot,
)
from ragas.messages import HumanMessage, ToolCall
-from ragas.metrics import FactualCorrectness, ToolCallF1
-from ragas.llms import LangchainLLMWrapper
+from ragas.metrics import ToolCallF1
+from ragas.metrics.collections import (
+ ContextPrecisionWithReference,
+ ContextRecall,
+ FactualCorrectness,
+ ResponseGroundedness,
+)
+from ragas.llms import llm_factory
from ag_ui.core import (
MessagesSnapshotEvent,
TextMessageChunkEvent,
@@ -109,21 +115,24 @@ weather_queries
## Configure metrics and the evaluator LLM
-Wrap your grading model with the appropriate adapter and instantiate the metrics you plan to use.
+Create an Instructor-compatible grading model with `llm_factory` and instantiate the metrics you plan to use.
```python
-evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
-
-qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
+client = AsyncOpenAI()
+evaluator_llm = llm_factory("gpt-4o-mini", client=client)
+
+qa_metrics = [
+ FactualCorrectness(llm=evaluator_llm, mode="f1"),
+ ContextPrecisionWithReference(llm=evaluator_llm),
+ ContextRecall(llm=evaluator_llm),
+ ResponseGroundedness(llm=evaluator_llm),
+]
tool_metrics = [ToolCallF1()] # rule-based, no LLM required
```
- /var/folders/8k/tf3xr1rd1fl_dz35dfhfp_tc0000gn/T/ipykernel_93918/2135722072.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))
- evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
-
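+
+Optionally, smoke-test the grading LLM on one hand-written pair before calling the endpoint. This is a sketch only: it reuses `evaluator_llm` from the cell above and assumes `FactualCorrectness.ascore` accepts `response` and `reference` keyword arguments (the same single-turn fields the integration fills in from the agent run), with top-level `await` available in the notebook.
+
+```python
+# Hypothetical smoke test for the evaluator; not part of the endpoint evaluation.
+probe = FactualCorrectness(llm=evaluator_llm, mode="f1")
+probe_result = await probe.ascore(
+    response="Albert Einstein originated the theory of relativity.",
+    reference="Albert Einstein originated the theory of relativity in 1905.",
+)
+print(probe_result.value)  # a float in [0, 1]
+```
+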
## Evaluate a live AG-UI endpoint
Set the endpoint URL exposed by your agent. Toggle the flags when you are ready to run the evaluations.
@@ -189,6 +198,9 @@ if RUN_FACTUAL_EVAL:
response |
reference |
factual_correctness(mode=f1) |
+ context_precision_with_reference |
+ context_recall |
+ response_groundedness |
@@ -199,6 +211,9 @@ if RUN_FACTUAL_EVAL:
The theory of relativity was originated by Alb... |
Albert Einstein originated the theory of relat... |
0.33 |
+ 0.50 |
+ 0.75 |
+ 0.80 |
| 1 |
@@ -207,6 +222,9 @@ if RUN_FACTUAL_EVAL:
Penicillin was discovered by Alexander Fleming... |
Alexander Fleming discovered penicillin in 1928. |
1.00 |
+ 0.75 |
+ 1.00 |
+ 0.95 |
diff --git a/docs/howtos/integrations/ag_ui.md b/docs/howtos/integrations/ag_ui.md
index 353a8445e..6cab35392 100644
--- a/docs/howtos/integrations/ag_ui.md
+++ b/docs/howtos/integrations/ag_ui.md
@@ -78,16 +78,28 @@ weather_queries = EvaluationDataset(
## Choose metrics and evaluator model
-The integration works with any Ragas metric. For most text-based evaluations you will want a grading LLM. Wrap your model with the appropriate adapter (LangChain shown here, but llama-index and LiteLLM wrappers work as well).
+The integration works with any Ragas metric. For the collections metrics in `ragas.metrics.collections`, build an Instructor-compatible evaluator LLM with `llm_factory` and pass it to each metric.
```python
-from ragas.metrics import FactualCorrectness, ToolCallF1
-from ragas.llms import LangchainLLMWrapper
-from langchain_openai import ChatOpenAI
+from openai import AsyncOpenAI
+from ragas.llms import llm_factory
+from ragas.metrics import ToolCallF1
+from ragas.metrics.collections import (
+ ContextPrecisionWithReference,
+ ContextRecall,
+ FactualCorrectness,
+ ResponseGroundedness,
+)
-evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
+client = AsyncOpenAI()
+evaluator_llm = llm_factory("gpt-4o-mini", client=client)
-qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
+qa_metrics = [
+ FactualCorrectness(llm=evaluator_llm, mode="f1"),
+ ContextPrecisionWithReference(llm=evaluator_llm),
+ ContextRecall(llm=evaluator_llm),
+ ResponseGroundedness(llm=evaluator_llm),
+]
tool_metrics = [ToolCallF1()] # rule-based metric, no LLM required
```
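+
+If you prefer a synchronous client, the `llm_factory` deprecation hint shipped with Ragas suggests the same call accepts `openai.OpenAI`; treat this as an unverified sketch and check it against your installed version.
+
+```python
+# Assumption: llm_factory also accepts a synchronous OpenAI client,
+# as suggested by the library's own deprecation message for LangchainLLMWrapper.
+from openai import OpenAI
+
+evaluator_llm = llm_factory("gpt-4o-mini", client=OpenAI())
+```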
diff --git a/examples/ragas_examples/ag_ui_agent_evals/README.md b/examples/ragas_examples/ag_ui_agent_evals/README.md
index 0846b6b48..b8f64ad32 100644
--- a/examples/ragas_examples/ag_ui_agent_evals/README.md
+++ b/examples/ragas_examples/ag_ui_agent_evals/README.md
@@ -36,18 +36,18 @@ Install the required dependencies:
uv pip install -e ".[dev]"
# Or install specific dependencies
-pip install ragas langchain-openai
+pip install ragas openai
```
## Evaluation Scenarios
This example includes two evaluation scenarios:
-### 1. Scientist Biographies (Factual Correctness)
+### 1. Scientist Biographies (Factuality & Grounding)
-Tests the agent's ability to provide factually correct information about famous scientists.
+Tests the agent's ability to provide factually correct information about famous scientists and ground its answers in retrieved evidence.
-- **Metric**: `FactualCorrectness` - Measures how accurate the agent's responses are compared to reference answers
+- **Metrics**: `FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, and `ResponseGroundedness` from `ragas.metrics.collections` (wired up in `evals.py` as sketched after this list)
- **Dataset**: `test_data/scientist_biographies.csv` - 5 questions about scientists (Einstein, Fleming, Newton, etc.)
- **Sample Type**: `SingleTurnSample` - Simple question-answer pairs
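+
+The sketch below mirrors how `evals.py` wires up this metric stack (abridged from the script; the model name is just the default used throughout these docs):
+
+```python
+from openai import AsyncOpenAI
+from ragas.llms import llm_factory
+from ragas.metrics.collections import (
+    ContextPrecisionWithReference,
+    ContextRecall,
+    FactualCorrectness,
+    ResponseGroundedness,
+)
+
+evaluator_llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())
+metrics = [
+    FactualCorrectness(llm=evaluator_llm, mode="f1"),
+    ContextPrecisionWithReference(llm=evaluator_llm),
+    ContextRecall(llm=evaluator_llm),
+    ResponseGroundedness(llm=evaluator_llm),
+]
+```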
@@ -120,13 +120,16 @@ Evaluating against endpoint: http://localhost:8000/agentic_chat
================================================================================
Scientist Biographies Evaluation Results
================================================================================
- user_input ... factual_correctness(mode=f1)
-0 Who originated the theory of relativity... ... 0.75
-1 Who discovered penicillin and when... ... 1.00
+ user_input ... response_groundedness
+0 Who originated the theory of relativity... ... 0.83
+1 Who discovered penicillin and when... ... 1.00
...
-Average Factual Correctness: 0.7160
-Perfect scores (1.0): 2/5
+Average factual_correctness(mode=f1): 0.7160
+Average context_precision_with_reference: 0.6500
+Average context_recall: 0.7200
+Average response_groundedness: 0.7800
+Perfect factual scores (1.0): 2/5
Results saved to: .../scientist_biographies_results_20250101_143022.csv
@@ -155,8 +158,8 @@ Results are saved as timestamped CSV files:
Example CSV structure:
```csv
-user_input,response,reference,factual_correctness(mode=f1)
-"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75
+user_input,response,reference,factual_correctness(mode=f1),context_precision_with_reference,context_recall,response_groundedness
+"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75,0.50,0.75,0.83
```
## Customizing the Evaluation
@@ -183,15 +186,16 @@ user_input,reference_tool_calls
### Using Different Metrics
-Modify `evals.py` to include additional Ragas metrics:
+Modify `evals.py` to include additional collections metrics:
```python
-from ragas.metrics import AnswerRelevancy, ContextPrecision
+from ragas.metrics.collections import AnswerRelevancy, ContextPrecisionWithoutReference
# In evaluate_scientist_biographies function:
metrics = [
- FactualCorrectness(),
- AnswerRelevancy(), # Add additional metrics
+ AnswerRelevancy(llm=evaluator_llm),
+ ContextPrecisionWithoutReference(llm=evaluator_llm),
+ ResponseGroundedness(llm=evaluator_llm),
]
```
diff --git a/examples/ragas_examples/ag_ui_agent_evals/__init__.py b/examples/ragas_examples/ag_ui_agent_evals/__init__.py
index 7b75b49c7..b0c223b1e 100644
--- a/examples/ragas_examples/ag_ui_agent_evals/__init__.py
+++ b/examples/ragas_examples/ag_ui_agent_evals/__init__.py
@@ -37,11 +37,13 @@
This package includes two evaluation scenarios:
-1. **Scientist Biographies** - Tests factual correctness of agent responses
- using the FactualCorrectness metric with SingleTurnSample datasets.
+1. **Scientist Biographies** - Uses the modern collections metrics
+ (`FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`,
+ `ResponseGroundedness`) with `SingleTurnSample` datasets to score factuality
+ and grounding in one pass.
-2. **Weather Tool Usage** - Tests tool calling accuracy using the ToolCallF1
- metric with MultiTurnSample datasets.
+2. **Weather Tool Usage** - Tests tool calling accuracy using the `ToolCallF1`
+ metric with `MultiTurnSample` datasets.
## Results
diff --git a/examples/ragas_examples/ag_ui_agent_evals/evals.py b/examples/ragas_examples/ag_ui_agent_evals/evals.py
index fbf822917..15bac7191 100644
--- a/examples/ragas_examples/ag_ui_agent_evals/evals.py
+++ b/examples/ragas_examples/ag_ui_agent_evals/evals.py
@@ -26,17 +26,23 @@
from pathlib import Path
from typing import List
-from langchain_openai import ChatOpenAI
-
+from openai import AsyncOpenAI
from ragas.dataset_schema import (
EvaluationDataset,
MultiTurnSample,
SingleTurnSample,
)
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
-from ragas.llms import LangchainLLMWrapper
+from ragas.llms import llm_factory
+from ragas.llms.base import InstructorBaseRagasLLM
from ragas.messages import HumanMessage, ToolCall
-from ragas.metrics import FactualCorrectness, ToolCallF1
+from ragas.metrics import ToolCallF1
+from ragas.metrics.collections import (
+ ContextPrecisionWithReference,
+ ContextRecall,
+ FactualCorrectness,
+ ResponseGroundedness,
+)
# Configure logging
logging.basicConfig(
@@ -104,7 +110,7 @@ def load_weather_dataset() -> EvaluationDataset:
async def evaluate_scientist_biographies(
- endpoint_url: str, evaluator_llm: LangchainLLMWrapper
+ endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to provide factually correct information
@@ -125,8 +131,13 @@ async def evaluate_scientist_biographies(
# Load dataset
dataset = load_scientist_dataset()
- # Define metrics
- metrics = [FactualCorrectness()]
+    # Define metrics from ragas.metrics.collections
+ metrics = [
+ FactualCorrectness(llm=evaluator_llm, mode="f1"),
+ ContextPrecisionWithReference(llm=evaluator_llm),
+ ContextRecall(llm=evaluator_llm),
+ ResponseGroundedness(llm=evaluator_llm),
+ ]
# Run evaluation
logger.info(f"Evaluating against endpoint: {endpoint_url}")
@@ -148,18 +159,26 @@ async def evaluate_scientist_biographies(
logger.info(f"\nDataFrame shape: {df.shape}")
logger.info(f"\n{df.to_string()}")
+ metric_columns = [
+ "factual_correctness(mode=f1)",
+ "context_precision_with_reference",
+ "context_recall",
+ "response_groundedness",
+ ]
+ for column in metric_columns:
+ if column in df.columns:
+ logger.info(f"Average {column}: {df[column].mean():.4f}")
+
if "factual_correctness(mode=f1)" in df.columns:
- avg_correctness = df["factual_correctness(mode=f1)"].mean()
- logger.info(f"\nAverage Factual Correctness: {avg_correctness:.4f}")
logger.info(
- f"Perfect scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
+ f"Perfect factual scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
)
return result, df
async def evaluate_weather_tool_use(
- endpoint_url: str, evaluator_llm: LangchainLLMWrapper
+ endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to correctly call the weather tool.
@@ -278,8 +297,8 @@ async def main():
# Setup evaluator LLM
logger.info(f"Setting up evaluator LLM: {args.evaluator_model}")
- llm = ChatOpenAI(model=args.evaluator_model)
- evaluator_llm = LangchainLLMWrapper(llm)
+ client = AsyncOpenAI()
+ evaluator_llm = llm_factory(args.evaluator_model, client=client)
# Run evaluations
try:
diff --git a/src/ragas/integrations/ag_ui.py b/src/ragas/integrations/ag_ui.py
index 69bc928dd..f9bc458e5 100644
--- a/src/ragas/integrations/ag_ui.py
+++ b/src/ragas/integrations/ag_ui.py
@@ -31,16 +31,21 @@
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
- from ragas.metrics import AspectCritic
+ from ragas.metrics.collections import FactualCorrectness
+ from ragas.llms import llm_factory
+ from openai import AsyncOpenAI
+
+ client = AsyncOpenAI()
+ evaluator_llm = llm_factory("gpt-4o-mini", client=client)
dataset = EvaluationDataset(samples=[
- SingleTurnSample(user_input="What's the weather in SF?")
+        SingleTurnSample(
+            user_input="What's the weather in SF?",
+            reference="It is currently sunny and mild in San Francisco.",
+        )
])
result = await evaluate_ag_ui_agent(
endpoint_url="http://localhost:8000/agent",
dataset=dataset,
- metrics=[AspectCritic()]
+ metrics=[FactualCorrectness(llm=evaluator_llm)]
)
Evaluate with multi-turn conversations and tool calls::
@@ -66,6 +71,7 @@
from __future__ import annotations
+import inspect
import json
import logging
import math
@@ -84,12 +90,53 @@
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.run_config import RunConfig
+try:
+ from ragas.metrics.collections.base import BaseMetric as CollectionsBaseMetric
+except ImportError: # pragma: no cover - collections are part of ragas, but guard just in case
+ CollectionsBaseMetric = t.cast(t.Type[object], None)
+
+from ragas.metrics.base import Metric, MetricType, SingleTurnMetric
+
if t.TYPE_CHECKING:
- from ragas.metrics.base import Metric
+ from ragas.metrics.collections.base import BaseMetric as _CollectionsBaseMetric
logger = logging.getLogger(__name__)
+def _is_collections_metric(metric: Any) -> bool:
+ """Return True if the metric originates from the collections portfolio."""
+
+ return CollectionsBaseMetric is not None and isinstance(metric, CollectionsBaseMetric)
+
+
+class _CollectionsSingleTurnMetricAdapter(SingleTurnMetric):
+ """Adapter that lets collections metrics participate in ragas.evaluate."""
+
+ def __init__(self, metric: CollectionsBaseMetric):
+ self._metric = metric
+ self.name = metric.name
+ self._parameter_names = [
+ name
+ for name in inspect.signature(metric.ascore).parameters.keys()
+ if name != "self"
+ ]
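+        # Advertise every ascore() parameter as a required single-turn column so
+        # ragas.evaluate can validate the dataset before scoring.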
+ required_columns = set(self._parameter_names)
+ self.required_columns = {MetricType.SINGLE_TURN: required_columns}
+
+ def init(self, run_config: RunConfig) -> None: # pragma: no cover - no-op for collections
+ """Collections metrics manage their own initialization."""
+
+ async def _single_turn_ascore(
+ self, sample: SingleTurnSample, callbacks: Optional[Any]
+ ) -> float:
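+        # Collections metrics take flat keyword arguments (user_input, response,
+        # reference, retrieved_contexts, ...); pull each one from the identically
+        # named SingleTurnSample field, defaulting to None when the sample lacks it.
+        # `callbacks` is accepted for interface compatibility but unused here.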
+ kwargs = {}
+ for param in self._parameter_names:
+ kwargs[param] = getattr(sample, param, None)
+
+ result = await self._metric.ascore(**kwargs)
+ return result.value
+
+
# Lazy imports for ag_ui to avoid hard dependency
def _import_ag_ui_core():
"""Import AG-UI core types with helpful error message."""
@@ -1035,10 +1082,34 @@ async def _call_ag_ui_endpoint(
return events
+def _prepare_metrics_for_evaluation(
+ metrics: t.Sequence[Union[Metric, "_CollectionsBaseMetric"]],
+ is_multi_turn: bool,
+) -> t.List[Metric]:
+ """Normalize metrics so ragas.evaluate can consume them."""
+
+ prepared: t.List[Metric] = []
+ for metric in metrics:
+ if isinstance(metric, Metric):
+ prepared.append(metric)
+ elif _is_collections_metric(metric):
+ if is_multi_turn:
+ raise ValueError(
+ "Collections metrics currently support only single-turn datasets in the AG-UI integration."
+ )
+ prepared.append(_CollectionsSingleTurnMetricAdapter(metric))
+ else:
+ raise TypeError(
+ "Metrics must be Ragas Metric instances or collections metrics."
+ )
+
+ return prepared
+
+
async def evaluate_ag_ui_agent(
endpoint_url: str,
dataset: EvaluationDataset,
- metrics: List["Metric"],
+ metrics: List[Union[Metric, "_CollectionsBaseMetric"]],
metadata: bool = False,
run_config: Optional[RunConfig] = None,
batch_size: Optional[int] = None,
@@ -1069,8 +1140,8 @@ async def evaluate_ag_ui_agent(
Dataset containing test queries. Can contain either:
- SingleTurnSample: user_input as string
- MultiTurnSample: user_input as list of messages
- metrics : List[Metric]
- List of Ragas metrics to evaluate (e.g., AspectCritic, ToolCallF1).
+ metrics : List[Metric or collections.BaseMetric]
+ List of Ragas metrics to evaluate (e.g., ResponseGroundedness, ToolCallF1).
metadata : bool, optional
Whether to include AG-UI metadata in converted messages (default: False).
run_config : RunConfig, optional
@@ -1107,8 +1178,15 @@ async def evaluate_ag_ui_agent(
>>> from ragas.integrations.ag_ui import evaluate_ag_ui_agent
>>> from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
- >>> from ragas.metrics import AspectCritic, Faithfulness
+ >>> from ragas.metrics.collections import (
+ ... ContextPrecisionWithReference,
+ ... FactualCorrectness,
+ ... )
+ >>> from ragas.llms import llm_factory
+ >>> from openai import AsyncOpenAI
>>>
+ >>> client = AsyncOpenAI()
+ >>> evaluator_llm = llm_factory("gpt-4o-mini", client=client)
>>> dataset = EvaluationDataset(samples=[
... SingleTurnSample(
... user_input="What's the weather in San Francisco?",
@@ -1119,7 +1197,10 @@ async def evaluate_ag_ui_agent(
>>> result = await evaluate_ag_ui_agent(
... endpoint_url="http://localhost:8000/agent",
... dataset=dataset,
- ... metrics=[AspectCritic(), Faithfulness()]
+ ... metrics=[
+ ... FactualCorrectness(llm=evaluator_llm),
+ ... ContextPrecisionWithReference(llm=evaluator_llm),
+ ... ]
... )
With AG-UI metadata included::
@@ -1127,7 +1208,7 @@ async def evaluate_ag_ui_agent(
>>> result = await evaluate_ag_ui_agent(
... endpoint_url="http://localhost:8000/agent",
... dataset=dataset,
- ... metrics=[AspectCritic()],
+ ... metrics=[FactualCorrectness(llm=evaluator_llm)],
... metadata=True # Include run_id, thread_id, etc.
... )
@@ -1174,6 +1255,7 @@ async def evaluate_ag_ui_agent(
# Support both single-turn and multi-turn evaluations
is_multi_turn = dataset.is_multi_turn()
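+    # Wrap collections metrics in single-turn adapters; native Ragas Metric
+    # instances pass through unchanged.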
+ prepared_metrics = _prepare_metrics_for_evaluation(metrics, is_multi_turn)
if is_multi_turn:
samples = t.cast(List[MultiTurnSample], dataset.samples)
else:
@@ -1300,7 +1382,7 @@ async def evaluate_ag_ui_agent(
# Run evaluation with metrics
evaluation_result = ragas_evaluate(
dataset=dataset,
- metrics=metrics,
+ metrics=prepared_metrics,
raise_exceptions=raise_exceptions,
show_progress=show_progress,
run_config=run_config or RunConfig(),