38 changes: 28 additions & 10 deletions docs/howtos/integrations/_ag_ui.md
@@ -26,7 +26,7 @@ import asyncio
from dotenv import load_dotenv
import nest_asyncio
from IPython.display import display
from langchain_openai import ChatOpenAI
from openai import AsyncOpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample, MultiTurnSample
from ragas.integrations.ag_ui import (
@@ -35,8 +35,14 @@ from ragas.integrations.ag_ui import (
convert_messages_snapshot,
)
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)
from ragas.llms import llm_factory
from ag_ui.core import (
MessagesSnapshotEvent,
TextMessageChunkEvent,
@@ -109,21 +115,24 @@ weather_queries


## Configure metrics and the evaluator LLM
Wrap your grading model with the appropriate adapter and instantiate the metrics you plan to use.
Create an Instructor-compatible grading model with `llm_factory` and instantiate the metrics you plan to use.



```python
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
client = AsyncOpenAI()
evaluator_llm = llm_factory("gpt-4o-mini", client=client)

qa_metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
tool_metrics = [ToolCallF1()] # rule-based, no LLM required

```

/var/folders/8k/tf3xr1rd1fl_dz35dfhfp_tc0000gn/T/ipykernel_93918/2135722072.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))


## Evaluate a live AG-UI endpoint
Set the endpoint URL exposed by your agent. Toggle the flags when you are ready to run the evaluations.
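
A minimal sketch of that configuration, assuming the endpoint path used elsewhere in these docs; `RUN_FACTUAL_EVAL` appears in the evaluation cells, while the URL variable and `RUN_TOOL_EVAL` names are illustrative:

```python
# Assumed names; adjust to match your own agent deployment.
AG_UI_ENDPOINT = "http://localhost:8000/agentic_chat"

RUN_FACTUAL_EVAL = False  # set to True to run the QA metrics against the live agent
RUN_TOOL_EVAL = False     # set to True to run the tool-call evaluation
```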
@@ -189,6 +198,9 @@ if RUN_FACTUAL_EVAL:
<th>response</th>
<th>reference</th>
<th>factual_correctness(mode=f1)</th>
<th>context_precision_with_reference</th>
<th>context_recall</th>
<th>response_groundedness</th>
</tr>
</thead>
<tbody>
@@ -199,6 +211,9 @@
<td>The theory of relativity was originated by Alb...</td>
<td>Albert Einstein originated the theory of relat...</td>
<td>0.33</td>
<td>0.50</td>
<td>0.75</td>
<td>0.80</td>
</tr>
<tr>
<th>1</th>
@@ -207,6 +222,9 @@
<td>Penicillin was discovered by Alexander Fleming...</td>
<td>Alexander Fleming discovered penicillin in 1928.</td>
<td>1.00</td>
<td>0.75</td>
<td>1.00</td>
<td>0.95</td>
</tr>
</tbody>
</table>
24 changes: 18 additions & 6 deletions docs/howtos/integrations/ag_ui.md
@@ -78,16 +78,28 @@ weather_queries = EvaluationDataset(

## Choose metrics and evaluator model

The integration works with any Ragas metric. For most text-based evaluations you will want a grading LLM. Wrap your model with the appropriate adapter (LangChain shown here, but llama-index and LiteLLM wrappers work as well).
The integration works with any Ragas metric. To use the metrics in `ragas.metrics.collections`, build an Instructor-compatible grading LLM with `llm_factory`.

```python
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
client = AsyncOpenAI()
evaluator_llm = llm_factory("gpt-4o-mini", client=client)

qa_metrics = [FactualCorrectness(llm=evaluator_llm)]
qa_metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
tool_metrics = [ToolCallF1()] # rule-based metric, no LLM required
```
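
`ToolCallF1` compares the tool calls the agent actually makes against the `reference_tool_calls` recorded on each `MultiTurnSample`. A minimal sketch of such a dataset, assuming a weather tool named `get_weather` that takes a `location` argument (both names are illustrative):

```python
from ragas.dataset_schema import EvaluationDataset, MultiTurnSample
from ragas.messages import HumanMessage, ToolCall

# Each sample pairs the user's request with the tool call the agent is expected
# to make; ToolCallF1 then scores the agent's actual calls against these references.
weather_queries = EvaluationDataset(
    samples=[
        MultiTurnSample(
            user_input=[HumanMessage(content="What's the weather in Paris right now?")],
            reference_tool_calls=[ToolCall(name="get_weather", args={"location": "Paris"})],
        )
    ]
)
```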

32 changes: 18 additions & 14 deletions examples/ragas_examples/ag_ui_agent_evals/README.md
@@ -36,18 +36,18 @@ Install the required dependencies:
uv pip install -e ".[dev]"

# Or install specific dependencies
pip install ragas langchain-openai
pip install ragas openai
```

## Evaluation Scenarios

This example includes two evaluation scenarios:

### 1. Scientist Biographies (Factual Correctness)
### 1. Scientist Biographies (Factuality & Grounding)

Tests the agent's ability to provide factually correct information about famous scientists.
Tests the agent's ability to provide factually correct information about famous scientists and ground its answers in retrieved evidence.

- **Metric**: `FactualCorrectness` - Measures how accurate the agent's responses are compared to reference answers
- **Metrics**: Collections metrics — `FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, `ResponseGroundedness`
- **Dataset**: `test_data/scientist_biographies.csv` - 5 questions about scientists (Einstein, Fleming, Newton, etc.)
- **Sample Type**: `SingleTurnSample` - Simple question-answer pairs (see the loading sketch below)
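
A minimal loading sketch, assuming the CSV uses `user_input` and `reference` columns (the actual loader and column names live in `evals.py`):

```python
import csv

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

# Assumed columns: user_input, reference. The agent's responses are collected
# later, when the evaluation runs against the live AG-UI endpoint.
with open("test_data/scientist_biographies.csv", newline="") as f:
    samples = [
        SingleTurnSample(user_input=row["user_input"], reference=row["reference"])
        for row in csv.DictReader(f)
    ]

scientist_dataset = EvaluationDataset(samples=samples)
```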

@@ -120,13 +120,16 @@ Evaluating against endpoint: http://localhost:8000/agentic_chat
================================================================================
Scientist Biographies Evaluation Results
================================================================================
user_input ... factual_correctness(mode=f1)
0 Who originated the theory of relativity... ... 0.75
1 Who discovered penicillin and when... ... 1.00
user_input ... response_groundedness
0 Who originated the theory of relativity... ... 0.83
1 Who discovered penicillin and when... ... 1.00
...

Average Factual Correctness: 0.7160
Perfect scores (1.0): 2/5
Average Context Precision: 0.6500
Average Context Recall: 0.7200
Average Response Groundedness: 0.7800
Perfect factual scores (1.0): 2/5

Results saved to: .../scientist_biographies_results_20250101_143022.csv

@@ -155,8 +158,8 @@ Results are saved as timestamped CSV files:
Example CSV structure:

```csv
user_input,response,reference,factual_correctness(mode=f1)
"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75
user_input,response,reference,factual_correctness(mode=f1),context_precision_with_reference,context_recall,response_groundedness
"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75,0.50,0.75,0.83
```
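
A short follow-up sketch, assuming pandas is installed, for aggregating a saved results file; the filename below is illustrative:

```python
import pandas as pd

# Illustrative filename; the script writes a timestamped file as shown above.
df = pd.read_csv("scientist_biographies_results_20250101_143022.csv")

metric_cols = [
    "factual_correctness(mode=f1)",
    "context_precision_with_reference",
    "context_recall",
    "response_groundedness",
]
print(df[metric_cols].mean().round(4))
print("Perfect factual scores:", (df["factual_correctness(mode=f1)"] == 1.0).sum(), "of", len(df))
```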

## Customizing the Evaluation
@@ -183,15 +186,16 @@ user_input,reference_tool_calls

### Using Different Metrics

Modify `evals.py` to include additional Ragas metrics:
Modify `evals.py` to include additional collections metrics:

```python
from ragas.metrics import AnswerRelevancy, ContextPrecision
from ragas.metrics.collections import AnswerRelevancy, ContextPrecisionWithoutReference

# In evaluate_scientist_biographies function:
metrics = [
FactualCorrectness(),
AnswerRelevancy(), # Add additional metrics
AnswerRelevancy(llm=evaluator_llm),
ContextPrecisionWithoutReference(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]
```

10 changes: 6 additions & 4 deletions examples/ragas_examples/ag_ui_agent_evals/__init__.py
@@ -37,11 +37,13 @@

This package includes two evaluation scenarios:

1. **Scientist Biographies** - Tests factual correctness of agent responses
using the FactualCorrectness metric with SingleTurnSample datasets.
1. **Scientist Biographies** - Uses the modern collections metrics
(`FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`,
`ResponseGroundedness`) with `SingleTurnSample` datasets to score factuality
and grounding in one pass.

2. **Weather Tool Usage** - Tests tool calling accuracy using the ToolCallF1
metric with MultiTurnSample datasets.
2. **Weather Tool Usage** - Tests tool calling accuracy using the `ToolCallF1`
metric with `MultiTurnSample` datasets.

## Results

45 changes: 32 additions & 13 deletions examples/ragas_examples/ag_ui_agent_evals/evals.py
@@ -26,17 +26,23 @@
from pathlib import Path
from typing import List

from langchain_openai import ChatOpenAI

from openai import AsyncOpenAI
from ragas.dataset_schema import (
EvaluationDataset,
MultiTurnSample,
SingleTurnSample,
)
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.llms import LangchainLLMWrapper
from ragas.llms import llm_factory
from ragas.llms.base import InstructorBaseRagasLLM
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import FactualCorrectness, ToolCallF1
from ragas.metrics import ToolCallF1
from ragas.metrics.collections import (
ContextPrecisionWithReference,
ContextRecall,
FactualCorrectness,
ResponseGroundedness,
)

# Configure logging
logging.basicConfig(
@@ -104,7 +110,7 @@ def load_weather_dataset() -> EvaluationDataset:


async def evaluate_scientist_biographies(
endpoint_url: str, evaluator_llm: LangchainLLMWrapper
endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to provide factually correct information
@@ -125,8 +131,13 @@ async def evaluate_scientist_biographies(
# Load dataset
dataset = load_scientist_dataset()

# Define metrics
metrics = [FactualCorrectness()]
# Define metrics using the modern collections portfolio
metrics = [
FactualCorrectness(llm=evaluator_llm, mode="f1"),
ContextPrecisionWithReference(llm=evaluator_llm),
ContextRecall(llm=evaluator_llm),
ResponseGroundedness(llm=evaluator_llm),
]

# Run evaluation
logger.info(f"Evaluating against endpoint: {endpoint_url}")
@@ -148,18 +159,26 @@ async def evaluate_scientist_biographies(
logger.info(f"\nDataFrame shape: {df.shape}")
logger.info(f"\n{df.to_string()}")

metric_columns = [
"factual_correctness(mode=f1)",
"context_precision_with_reference",
"context_recall",
"response_groundedness",
]
for column in metric_columns:
if column in df.columns:
logger.info(f"Average {column}: {df[column].mean():.4f}")

if "factual_correctness(mode=f1)" in df.columns:
avg_correctness = df["factual_correctness(mode=f1)"].mean()
logger.info(f"\nAverage Factual Correctness: {avg_correctness:.4f}")
logger.info(
f"Perfect scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
f"Perfect factual scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}"
)

return result, df


async def evaluate_weather_tool_use(
endpoint_url: str, evaluator_llm: LangchainLLMWrapper
endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM
) -> tuple:
"""
Evaluate the agent's ability to correctly call the weather tool.
@@ -278,8 +297,8 @@ async def main():

# Setup evaluator LLM
logger.info(f"Setting up evaluator LLM: {args.evaluator_model}")
llm = ChatOpenAI(model=args.evaluator_model)
evaluator_llm = LangchainLLMWrapper(llm)
client = AsyncOpenAI()
evaluator_llm = llm_factory(args.evaluator_model, client=client)

# Run evaluations
try: