diff --git a/docs/howtos/integrations/_ag_ui.md b/docs/howtos/integrations/_ag_ui.md index cf9e056a5..4d3308346 100644 --- a/docs/howtos/integrations/_ag_ui.md +++ b/docs/howtos/integrations/_ag_ui.md @@ -26,7 +26,7 @@ import asyncio from dotenv import load_dotenv import nest_asyncio from IPython.display import display -from langchain_openai import ChatOpenAI +from openai import AsyncOpenAI from ragas.dataset_schema import EvaluationDataset, SingleTurnSample, MultiTurnSample from ragas.integrations.ag_ui import ( @@ -35,8 +35,14 @@ from ragas.integrations.ag_ui import ( convert_messages_snapshot, ) from ragas.messages import HumanMessage, ToolCall -from ragas.metrics import FactualCorrectness, ToolCallF1 -from ragas.llms import LangchainLLMWrapper +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) +from ragas.llms import llm_factory from ag_ui.core import ( MessagesSnapshotEvent, TextMessageChunkEvent, @@ -109,21 +115,24 @@ weather_queries ## Configure metrics and the evaluator LLM -Wrap your grading model with the appropriate adapter and instantiate the metrics you plan to use. +Create an Instructor-compatible grading model with `llm_factory` and instantiate the metrics you plan to use. ```python -evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) - -qa_metrics = [FactualCorrectness(llm=evaluator_llm)] +client = AsyncOpenAI() +evaluator_llm = llm_factory("gpt-4o-mini", client=client) + +qa_metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), +] tool_metrics = [ToolCallF1()] # rule-based, no LLM required ``` - /var/folders/8k/tf3xr1rd1fl_dz35dfhfp_tc0000gn/T/ipykernel_93918/2135722072.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...')) - evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) - ## Evaluate a live AG-UI endpoint Set the endpoint URL exposed by your agent. Toggle the flags when you are ready to run the evaluations. @@ -189,6 +198,9 @@ if RUN_FACTUAL_EVAL: response reference factual_correctness(mode=f1) + context_precision_with_reference + context_recall + response_groundedness @@ -199,6 +211,9 @@ if RUN_FACTUAL_EVAL: The theory of relativity was originated by Alb... Albert Einstein originated the theory of relat... 0.33 + 0.50 + 0.75 + 0.80 1 @@ -207,6 +222,9 @@ if RUN_FACTUAL_EVAL: Penicillin was discovered by Alexander Fleming... Alexander Fleming discovered penicillin in 1928. 1.00 + 0.75 + 1.00 + 0.95 diff --git a/docs/howtos/integrations/ag_ui.md b/docs/howtos/integrations/ag_ui.md index 353a8445e..6cab35392 100644 --- a/docs/howtos/integrations/ag_ui.md +++ b/docs/howtos/integrations/ag_ui.md @@ -78,16 +78,28 @@ weather_queries = EvaluationDataset( ## Choose metrics and evaluator model -The integration works with any Ragas metric. For most text-based evaluations you will want a grading LLM. Wrap your model with the appropriate adapter (LangChain shown here, but llama-index and LiteLLM wrappers work as well). +The integration works with any Ragas metric. To unlock the modern collections portfolio, build an Instructor-compatible LLM with `llm_factory`. 
```python -from ragas.metrics import FactualCorrectness, ToolCallF1 -from ragas.llms import LangchainLLMWrapper -from langchain_openai import ChatOpenAI +from openai import AsyncOpenAI +from ragas.llms import llm_factory +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) -evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) +client = AsyncOpenAI() +evaluator_llm = llm_factory("gpt-4o-mini", client=client) -qa_metrics = [FactualCorrectness(llm=evaluator_llm)] +qa_metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), +] tool_metrics = [ToolCallF1()] # rule-based metric, no LLM required ``` diff --git a/examples/ragas_examples/ag_ui_agent_evals/README.md b/examples/ragas_examples/ag_ui_agent_evals/README.md index 0846b6b48..b8f64ad32 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/README.md +++ b/examples/ragas_examples/ag_ui_agent_evals/README.md @@ -36,18 +36,18 @@ Install the required dependencies: uv pip install -e ".[dev]" # Or install specific dependencies -pip install ragas langchain-openai +pip install ragas openai ``` ## Evaluation Scenarios This example includes two evaluation scenarios: -### 1. Scientist Biographies (Factual Correctness) +### 1. Scientist Biographies (Factuality & Grounding) -Tests the agent's ability to provide factually correct information about famous scientists. +Tests the agent's ability to provide factually correct information about famous scientists and ground its answers in retrieved evidence. -- **Metric**: `FactualCorrectness` - Measures how accurate the agent's responses are compared to reference answers +- **Metrics**: Collections metrics — `FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, `ResponseGroundedness` - **Dataset**: `test_data/scientist_biographies.csv` - 5 questions about scientists (Einstein, Fleming, Newton, etc.) - **Sample Type**: `SingleTurnSample` - Simple question-answer pairs @@ -120,13 +120,16 @@ Evaluating against endpoint: http://localhost:8000/agentic_chat ================================================================================ Scientist Biographies Evaluation Results ================================================================================ - user_input ... factual_correctness(mode=f1) -0 Who originated the theory of relativity... ... 0.75 -1 Who discovered penicillin and when... ... 1.00 + user_input ... response_groundedness +0 Who originated the theory of relativity... ... 0.83 +1 Who discovered penicillin and when... ... 1.00 ... 
Average Factual Correctness: 0.7160 -Perfect scores (1.0): 2/5 +Average Context Precision: 0.6500 +Average Context Recall: 0.7200 +Average Response Groundedness: 0.7800 +Perfect factual scores (1.0): 2/5 Results saved to: .../scientist_biographies_results_20250101_143022.csv @@ -155,8 +158,8 @@ Results are saved as timestamped CSV files: Example CSV structure: ```csv -user_input,response,reference,factual_correctness(mode=f1) -"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75 +user_input,response,reference,factual_correctness(mode=f1),context_precision_with_reference,context_recall,response_groundedness +"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75,0.50,0.75,0.83 ``` ## Customizing the Evaluation @@ -183,15 +186,16 @@ user_input,reference_tool_calls ### Using Different Metrics -Modify `evals.py` to include additional Ragas metrics: +Modify `evals.py` to include additional collections metrics: ```python -from ragas.metrics import AnswerRelevancy, ContextPrecision +from ragas.metrics.collections import AnswerRelevancy, ContextPrecisionWithoutReference # In evaluate_scientist_biographies function: metrics = [ - FactualCorrectness(), - AnswerRelevancy(), # Add additional metrics + AnswerRelevancy(llm=evaluator_llm), + ContextPrecisionWithoutReference(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), ] ``` diff --git a/examples/ragas_examples/ag_ui_agent_evals/__init__.py b/examples/ragas_examples/ag_ui_agent_evals/__init__.py index 7b75b49c7..b0c223b1e 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/__init__.py +++ b/examples/ragas_examples/ag_ui_agent_evals/__init__.py @@ -37,11 +37,13 @@ This package includes two evaluation scenarios: -1. **Scientist Biographies** - Tests factual correctness of agent responses - using the FactualCorrectness metric with SingleTurnSample datasets. +1. **Scientist Biographies** - Uses the modern collections metrics + (`FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, + `ResponseGroundedness`) with `SingleTurnSample` datasets to score factuality + and grounding in one pass. -2. **Weather Tool Usage** - Tests tool calling accuracy using the ToolCallF1 - metric with MultiTurnSample datasets. +2. **Weather Tool Usage** - Tests tool calling accuracy using the `ToolCallF1` + metric with `MultiTurnSample` datasets. 
## Results diff --git a/examples/ragas_examples/ag_ui_agent_evals/evals.py b/examples/ragas_examples/ag_ui_agent_evals/evals.py index fbf822917..15bac7191 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/evals.py +++ b/examples/ragas_examples/ag_ui_agent_evals/evals.py @@ -26,17 +26,23 @@ from pathlib import Path from typing import List -from langchain_openai import ChatOpenAI - +from openai import AsyncOpenAI from ragas.dataset_schema import ( EvaluationDataset, MultiTurnSample, SingleTurnSample, ) from ragas.integrations.ag_ui import evaluate_ag_ui_agent -from ragas.llms import LangchainLLMWrapper +from ragas.llms import llm_factory +from ragas.llms.base import InstructorBaseRagasLLM from ragas.messages import HumanMessage, ToolCall -from ragas.metrics import FactualCorrectness, ToolCallF1 +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) # Configure logging logging.basicConfig( @@ -104,7 +110,7 @@ def load_weather_dataset() -> EvaluationDataset: async def evaluate_scientist_biographies( - endpoint_url: str, evaluator_llm: LangchainLLMWrapper + endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM ) -> tuple: """ Evaluate the agent's ability to provide factually correct information @@ -125,8 +131,13 @@ async def evaluate_scientist_biographies( # Load dataset dataset = load_scientist_dataset() - # Define metrics - metrics = [FactualCorrectness()] + # Define metrics using the modern collections portfolio + metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), + ] # Run evaluation logger.info(f"Evaluating against endpoint: {endpoint_url}") @@ -148,18 +159,26 @@ async def evaluate_scientist_biographies( logger.info(f"\nDataFrame shape: {df.shape}") logger.info(f"\n{df.to_string()}") + metric_columns = [ + "factual_correctness(mode=f1)", + "context_precision_with_reference", + "context_recall", + "response_groundedness", + ] + for column in metric_columns: + if column in df.columns: + logger.info(f"Average {column}: {df[column].mean():.4f}") + if "factual_correctness(mode=f1)" in df.columns: - avg_correctness = df["factual_correctness(mode=f1)"].mean() - logger.info(f"\nAverage Factual Correctness: {avg_correctness:.4f}") logger.info( - f"Perfect scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}" + f"Perfect factual scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}" ) return result, df async def evaluate_weather_tool_use( - endpoint_url: str, evaluator_llm: LangchainLLMWrapper + endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM ) -> tuple: """ Evaluate the agent's ability to correctly call the weather tool. 
@@ -278,8 +297,8 @@ async def main(): # Setup evaluator LLM logger.info(f"Setting up evaluator LLM: {args.evaluator_model}") - llm = ChatOpenAI(model=args.evaluator_model) - evaluator_llm = LangchainLLMWrapper(llm) + client = AsyncOpenAI() + evaluator_llm = llm_factory(args.evaluator_model, client=client) # Run evaluations try: diff --git a/src/ragas/integrations/ag_ui.py b/src/ragas/integrations/ag_ui.py index 69bc928dd..f9bc458e5 100644 --- a/src/ragas/integrations/ag_ui.py +++ b/src/ragas/integrations/ag_ui.py @@ -31,16 +31,21 @@ from ragas.integrations.ag_ui import evaluate_ag_ui_agent from ragas.dataset_schema import EvaluationDataset, SingleTurnSample - from ragas.metrics import AspectCritic + from ragas.metrics.collections import FactualCorrectness + from ragas.llms import llm_factory + from openai import AsyncOpenAI + + client = AsyncOpenAI() + evaluator_llm = llm_factory("gpt-4o-mini", client=client) dataset = EvaluationDataset(samples=[ - SingleTurnSample(user_input="What's the weather in SF?") + SingleTurnSample(user_input="What's the weather in SF?", reference="Use the weather API") ]) result = await evaluate_ag_ui_agent( endpoint_url="http://localhost:8000/agent", dataset=dataset, - metrics=[AspectCritic()] + metrics=[FactualCorrectness(llm=evaluator_llm)] ) Evaluate with multi-turn conversations and tool calls:: @@ -66,6 +71,7 @@ from __future__ import annotations +import inspect import json import logging import math @@ -84,12 +90,53 @@ from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage from ragas.run_config import RunConfig +try: + from ragas.metrics.collections.base import BaseMetric as CollectionsBaseMetric +except ImportError: # pragma: no cover - collections are part of ragas, but guard just in case + CollectionsBaseMetric = t.cast(t.Type[object], None) + +from ragas.metrics.base import Metric, MetricType, SingleTurnMetric + if t.TYPE_CHECKING: - from ragas.metrics.base import Metric + from ragas.metrics.collections.base import BaseMetric as _CollectionsBaseMetric logger = logging.getLogger(__name__) +def _is_collections_metric(metric: Any) -> bool: + """Return True if the metric originates from the collections portfolio.""" + + return CollectionsBaseMetric is not None and isinstance(metric, CollectionsBaseMetric) + + +class _CollectionsSingleTurnMetricAdapter(SingleTurnMetric): + """Adapter that lets collections metrics participate in ragas.evaluate.""" + + def __init__(self, metric: CollectionsBaseMetric): + self._metric = metric + self.name = metric.name + self._parameter_names = [ + name + for name in inspect.signature(metric.ascore).parameters.keys() + if name != "self" + ] + required_columns = set(self._parameter_names) + self.required_columns = {MetricType.SINGLE_TURN: required_columns} + + def init(self, run_config: RunConfig) -> None: # pragma: no cover - no-op for collections + """Collections metrics manage their own initialization.""" + + async def _single_turn_ascore( + self, sample: SingleTurnSample, callbacks: Optional[Any] + ) -> float: + kwargs = {} + for param in self._parameter_names: + kwargs[param] = getattr(sample, param, None) + + result = await self._metric.ascore(**kwargs) + return result.value + + # Lazy imports for ag_ui to avoid hard dependency def _import_ag_ui_core(): """Import AG-UI core types with helpful error message.""" @@ -1035,10 +1082,34 @@ async def _call_ag_ui_endpoint( return events +def _prepare_metrics_for_evaluation( + metrics: t.Sequence[Union[Metric, "_CollectionsBaseMetric"]], + is_multi_turn: bool, +) 
-> t.List[Metric]: + """Normalize metrics so ragas.evaluate can consume them.""" + + prepared: t.List[Metric] = [] + for metric in metrics: + if isinstance(metric, Metric): + prepared.append(metric) + elif _is_collections_metric(metric): + if is_multi_turn: + raise ValueError( + "Collections metrics currently support only single-turn datasets in the AG-UI integration." + ) + prepared.append(_CollectionsSingleTurnMetricAdapter(metric)) + else: + raise TypeError( + "Metrics must be Ragas Metric instances or collections metrics." + ) + + return prepared + + async def evaluate_ag_ui_agent( endpoint_url: str, dataset: EvaluationDataset, - metrics: List["Metric"], + metrics: List[Union[Metric, "_CollectionsBaseMetric"]], metadata: bool = False, run_config: Optional[RunConfig] = None, batch_size: Optional[int] = None, @@ -1069,8 +1140,8 @@ async def evaluate_ag_ui_agent( Dataset containing test queries. Can contain either: - SingleTurnSample: user_input as string - MultiTurnSample: user_input as list of messages - metrics : List[Metric] - List of Ragas metrics to evaluate (e.g., AspectCritic, ToolCallF1). + metrics : List[Metric or collections.BaseMetric] + List of Ragas metrics to evaluate (e.g., ResponseGroundedness, ToolCallF1). metadata : bool, optional Whether to include AG-UI metadata in converted messages (default: False). run_config : RunConfig, optional @@ -1107,8 +1178,15 @@ async def evaluate_ag_ui_agent( >>> from ragas.integrations.ag_ui import evaluate_ag_ui_agent >>> from ragas.dataset_schema import EvaluationDataset, SingleTurnSample - >>> from ragas.metrics import AspectCritic, Faithfulness + >>> from ragas.metrics.collections import ( + ... ContextPrecisionWithReference, + ... FactualCorrectness, + ... ) + >>> from ragas.llms import llm_factory + >>> from openai import AsyncOpenAI >>> + >>> client = AsyncOpenAI() + >>> evaluator_llm = llm_factory("gpt-4o-mini", client=client) >>> dataset = EvaluationDataset(samples=[ ... SingleTurnSample( ... user_input="What's the weather in San Francisco?", @@ -1119,7 +1197,10 @@ async def evaluate_ag_ui_agent( >>> result = await evaluate_ag_ui_agent( ... endpoint_url="http://localhost:8000/agent", ... dataset=dataset, - ... metrics=[AspectCritic(), Faithfulness()] + ... metrics=[ + ... FactualCorrectness(llm=evaluator_llm), + ... ContextPrecisionWithReference(llm=evaluator_llm), + ... ] ... ) With AG-UI metadata included:: @@ -1127,7 +1208,7 @@ async def evaluate_ag_ui_agent( >>> result = await evaluate_ag_ui_agent( ... endpoint_url="http://localhost:8000/agent", ... dataset=dataset, - ... metrics=[AspectCritic()], + ... metrics=[FactualCorrectness(llm=evaluator_llm)], ... metadata=True # Include run_id, thread_id, etc. ... ) @@ -1174,6 +1255,7 @@ async def evaluate_ag_ui_agent( # Support both single-turn and multi-turn evaluations is_multi_turn = dataset.is_multi_turn() + prepared_metrics = _prepare_metrics_for_evaluation(metrics, is_multi_turn) if is_multi_turn: samples = t.cast(List[MultiTurnSample], dataset.samples) else: @@ -1300,7 +1382,7 @@ async def evaluate_ag_ui_agent( # Run evaluation with metrics evaluation_result = ragas_evaluate( dataset=dataset, - metrics=metrics, + metrics=prepared_metrics, raise_exceptions=raise_exceptions, show_progress=show_progress, run_config=run_config or RunConfig(),
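
For reference, a minimal end-to-end sketch of the single-turn workflow this patch documents. It only reuses names that appear in the diff above; the endpoint URL, model name, and sample content are placeholders, not part of the patch itself.

```python
# Sketch only: mirrors the API surface shown in this patch.
# Assumes an AG-UI agent is running at http://localhost:8000/agent
# and OPENAI_API_KEY is set in the environment.
import asyncio

from openai import AsyncOpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.llms import llm_factory
from ragas.metrics.collections import FactualCorrectness, ResponseGroundedness


async def main() -> None:
    # Instructor-compatible evaluator LLM, as in the updated docs above.
    client = AsyncOpenAI()
    evaluator_llm = llm_factory("gpt-4o-mini", client=client)

    dataset = EvaluationDataset(
        samples=[
            SingleTurnSample(
                user_input="Who originated the theory of relativity?",
                reference="Albert Einstein originated the theory of relativity.",
            )
        ]
    )

    # Collections metrics are adapted internally by
    # _prepare_metrics_for_evaluation before ragas_evaluate runs.
    result = await evaluate_ag_ui_agent(
        endpoint_url="http://localhost:8000/agent",
        dataset=dataset,
        metrics=[
            FactualCorrectness(llm=evaluator_llm, mode="f1"),
            ResponseGroundedness(llm=evaluator_llm),
        ],
    )
    print(result.to_pandas())


if __name__ == "__main__":
    asyncio.run(main())
```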
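
A companion sketch for the multi-turn weather scenario, which stays on the rule-based `ToolCallF1` path since `_prepare_metrics_for_evaluation` rejects collections metrics for multi-turn datasets. The endpoint path and the tool name/args are assumptions for illustration.

```python
# Sketch only: rule-based tool-call evaluation, no evaluator LLM required.
# The endpoint path and tool name/args are illustrative placeholders.
import asyncio

from ragas.dataset_schema import EvaluationDataset, MultiTurnSample
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import ToolCallF1


async def main() -> None:
    dataset = EvaluationDataset(
        samples=[
            MultiTurnSample(
                user_input=[HumanMessage(content="What's the weather in Paris?")],
                reference_tool_calls=[
                    ToolCall(name="get_weather", args={"location": "Paris"})
                ],
            )
        ]
    )

    # ToolCallF1 compares the agent's emitted tool calls against
    # reference_tool_calls; no grading LLM is involved.
    result = await evaluate_ag_ui_agent(
        endpoint_url="http://localhost:8000/agentic_chat",
        dataset=dataset,
        metrics=[ToolCallF1()],
    )
    print(result.to_pandas())


if __name__ == "__main__":
    asyncio.run(main())
```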