diff --git a/docs/howtos/integrations/_ag_ui.md b/docs/howtos/integrations/_ag_ui.md index cf9e056a5..4d3308346 100644 --- a/docs/howtos/integrations/_ag_ui.md +++ b/docs/howtos/integrations/_ag_ui.md @@ -26,7 +26,7 @@ import asyncio from dotenv import load_dotenv import nest_asyncio from IPython.display import display -from langchain_openai import ChatOpenAI +from openai import AsyncOpenAI from ragas.dataset_schema import EvaluationDataset, SingleTurnSample, MultiTurnSample from ragas.integrations.ag_ui import ( @@ -35,8 +35,14 @@ from ragas.integrations.ag_ui import ( convert_messages_snapshot, ) from ragas.messages import HumanMessage, ToolCall -from ragas.metrics import FactualCorrectness, ToolCallF1 -from ragas.llms import LangchainLLMWrapper +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) +from ragas.llms import llm_factory from ag_ui.core import ( MessagesSnapshotEvent, TextMessageChunkEvent, @@ -109,21 +115,24 @@ weather_queries ## Configure metrics and the evaluator LLM -Wrap your grading model with the appropriate adapter and instantiate the metrics you plan to use. +Create an Instructor-compatible grading model with `llm_factory` and instantiate the metrics you plan to use. ```python -evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) - -qa_metrics = [FactualCorrectness(llm=evaluator_llm)] +client = AsyncOpenAI() +evaluator_llm = llm_factory("gpt-4o-mini", client=client) + +qa_metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), +] tool_metrics = [ToolCallF1()] # rule-based, no LLM required ``` - /var/folders/8k/tf3xr1rd1fl_dz35dfhfp_tc0000gn/T/ipykernel_93918/2135722072.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...')) - evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) - ## Evaluate a live AG-UI endpoint Set the endpoint URL exposed by your agent. Toggle the flags when you are ready to run the evaluations. @@ -189,6 +198,9 @@ if RUN_FACTUAL_EVAL: response reference factual_correctness(mode=f1) + context_precision_with_reference + context_recall + response_groundedness @@ -199,6 +211,9 @@ if RUN_FACTUAL_EVAL: The theory of relativity was originated by Alb... Albert Einstein originated the theory of relat... 0.33 + 0.50 + 0.75 + 0.80 1 @@ -207,6 +222,9 @@ if RUN_FACTUAL_EVAL: Penicillin was discovered by Alexander Fleming... Alexander Fleming discovered penicillin in 1928. 1.00 + 0.75 + 1.00 + 0.95 diff --git a/docs/howtos/integrations/ag_ui.md b/docs/howtos/integrations/ag_ui.md index 353a8445e..6cab35392 100644 --- a/docs/howtos/integrations/ag_ui.md +++ b/docs/howtos/integrations/ag_ui.md @@ -78,16 +78,28 @@ weather_queries = EvaluationDataset( ## Choose metrics and evaluator model -The integration works with any Ragas metric. For most text-based evaluations you will want a grading LLM. Wrap your model with the appropriate adapter (LangChain shown here, but llama-index and LiteLLM wrappers work as well). +The integration works with any Ragas metric. To unlock the modern collections portfolio, build an Instructor-compatible LLM with `llm_factory`. 
```python -from ragas.metrics import FactualCorrectness, ToolCallF1 -from ragas.llms import LangchainLLMWrapper -from langchain_openai import ChatOpenAI +from openai import AsyncOpenAI +from ragas.llms import llm_factory +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) -evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) +client = AsyncOpenAI() +evaluator_llm = llm_factory("gpt-4o-mini", client=client) -qa_metrics = [FactualCorrectness(llm=evaluator_llm)] +qa_metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), +] tool_metrics = [ToolCallF1()] # rule-based metric, no LLM required ``` diff --git a/examples/ragas_examples/ag_ui_agent_evals/README.md b/examples/ragas_examples/ag_ui_agent_evals/README.md index 0846b6b48..b8f64ad32 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/README.md +++ b/examples/ragas_examples/ag_ui_agent_evals/README.md @@ -36,18 +36,18 @@ Install the required dependencies: uv pip install -e ".[dev]" # Or install specific dependencies -pip install ragas langchain-openai +pip install ragas openai ``` ## Evaluation Scenarios This example includes two evaluation scenarios: -### 1. Scientist Biographies (Factual Correctness) +### 1. Scientist Biographies (Factuality & Grounding) -Tests the agent's ability to provide factually correct information about famous scientists. +Tests the agent's ability to provide factually correct information about famous scientists and ground its answers in retrieved evidence. -- **Metric**: `FactualCorrectness` - Measures how accurate the agent's responses are compared to reference answers +- **Metrics**: Collections metrics — `FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, `ResponseGroundedness` - **Dataset**: `test_data/scientist_biographies.csv` - 5 questions about scientists (Einstein, Fleming, Newton, etc.) - **Sample Type**: `SingleTurnSample` - Simple question-answer pairs @@ -120,13 +120,16 @@ Evaluating against endpoint: http://localhost:8000/agentic_chat ================================================================================ Scientist Biographies Evaluation Results ================================================================================ - user_input ... factual_correctness(mode=f1) -0 Who originated the theory of relativity... ... 0.75 -1 Who discovered penicillin and when... ... 1.00 + user_input ... response_groundedness +0 Who originated the theory of relativity... ... 0.83 +1 Who discovered penicillin and when... ... 1.00 ... 
Average Factual Correctness: 0.7160 -Perfect scores (1.0): 2/5 +Average Context Precision: 0.6500 +Average Context Recall: 0.7200 +Average Response Groundedness: 0.7800 +Perfect factual scores (1.0): 2/5 Results saved to: .../scientist_biographies_results_20250101_143022.csv @@ -155,8 +158,8 @@ Results are saved as timestamped CSV files: Example CSV structure: ```csv -user_input,response,reference,factual_correctness(mode=f1) -"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75 +user_input,response,reference,factual_correctness(mode=f1),context_precision_with_reference,context_recall,response_groundedness +"Who originated the theory of relativity...","Albert Einstein...","Albert Einstein originated...",0.75,0.50,0.75,0.83 ``` ## Customizing the Evaluation @@ -183,15 +186,16 @@ user_input,reference_tool_calls ### Using Different Metrics -Modify `evals.py` to include additional Ragas metrics: +Modify `evals.py` to include additional collections metrics: ```python -from ragas.metrics import AnswerRelevancy, ContextPrecision +from ragas.metrics.collections import AnswerRelevancy, ContextPrecisionWithoutReference # In evaluate_scientist_biographies function: metrics = [ - FactualCorrectness(), - AnswerRelevancy(), # Add additional metrics + AnswerRelevancy(llm=evaluator_llm), + ContextPrecisionWithoutReference(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), ] ``` diff --git a/examples/ragas_examples/ag_ui_agent_evals/__init__.py b/examples/ragas_examples/ag_ui_agent_evals/__init__.py index 7b75b49c7..b0c223b1e 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/__init__.py +++ b/examples/ragas_examples/ag_ui_agent_evals/__init__.py @@ -37,11 +37,13 @@ This package includes two evaluation scenarios: -1. **Scientist Biographies** - Tests factual correctness of agent responses - using the FactualCorrectness metric with SingleTurnSample datasets. +1. **Scientist Biographies** - Uses the modern collections metrics + (`FactualCorrectness`, `ContextPrecisionWithReference`, `ContextRecall`, + `ResponseGroundedness`) with `SingleTurnSample` datasets to score factuality + and grounding in one pass. -2. **Weather Tool Usage** - Tests tool calling accuracy using the ToolCallF1 - metric with MultiTurnSample datasets. +2. **Weather Tool Usage** - Tests tool calling accuracy using the `ToolCallF1` + metric with `MultiTurnSample` datasets. 
## Results diff --git a/examples/ragas_examples/ag_ui_agent_evals/evals.py b/examples/ragas_examples/ag_ui_agent_evals/evals.py index fbf822917..15bac7191 100644 --- a/examples/ragas_examples/ag_ui_agent_evals/evals.py +++ b/examples/ragas_examples/ag_ui_agent_evals/evals.py @@ -26,17 +26,23 @@ from pathlib import Path from typing import List -from langchain_openai import ChatOpenAI - +from openai import AsyncOpenAI from ragas.dataset_schema import ( EvaluationDataset, MultiTurnSample, SingleTurnSample, ) from ragas.integrations.ag_ui import evaluate_ag_ui_agent -from ragas.llms import LangchainLLMWrapper +from ragas.llms import llm_factory +from ragas.llms.base import InstructorBaseRagasLLM from ragas.messages import HumanMessage, ToolCall -from ragas.metrics import FactualCorrectness, ToolCallF1 +from ragas.metrics import ToolCallF1 +from ragas.metrics.collections import ( + ContextPrecisionWithReference, + ContextRecall, + FactualCorrectness, + ResponseGroundedness, +) # Configure logging logging.basicConfig( @@ -104,7 +110,7 @@ def load_weather_dataset() -> EvaluationDataset: async def evaluate_scientist_biographies( - endpoint_url: str, evaluator_llm: LangchainLLMWrapper + endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM ) -> tuple: """ Evaluate the agent's ability to provide factually correct information @@ -125,8 +131,13 @@ async def evaluate_scientist_biographies( # Load dataset dataset = load_scientist_dataset() - # Define metrics - metrics = [FactualCorrectness()] + # Define metrics using the modern collections portfolio + metrics = [ + FactualCorrectness(llm=evaluator_llm, mode="f1"), + ContextPrecisionWithReference(llm=evaluator_llm), + ContextRecall(llm=evaluator_llm), + ResponseGroundedness(llm=evaluator_llm), + ] # Run evaluation logger.info(f"Evaluating against endpoint: {endpoint_url}") @@ -148,18 +159,26 @@ async def evaluate_scientist_biographies( logger.info(f"\nDataFrame shape: {df.shape}") logger.info(f"\n{df.to_string()}") + metric_columns = [ + "factual_correctness(mode=f1)", + "context_precision_with_reference", + "context_recall", + "response_groundedness", + ] + for column in metric_columns: + if column in df.columns: + logger.info(f"Average {column}: {df[column].mean():.4f}") + if "factual_correctness(mode=f1)" in df.columns: - avg_correctness = df["factual_correctness(mode=f1)"].mean() - logger.info(f"\nAverage Factual Correctness: {avg_correctness:.4f}") logger.info( - f"Perfect scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}" + f"Perfect factual scores (1.0): {(df['factual_correctness(mode=f1)'] == 1.0).sum()}/{len(df)}" ) return result, df async def evaluate_weather_tool_use( - endpoint_url: str, evaluator_llm: LangchainLLMWrapper + endpoint_url: str, evaluator_llm: InstructorBaseRagasLLM ) -> tuple: """ Evaluate the agent's ability to correctly call the weather tool. 
@@ -278,8 +297,8 @@ async def main(): # Setup evaluator LLM logger.info(f"Setting up evaluator LLM: {args.evaluator_model}") - llm = ChatOpenAI(model=args.evaluator_model) - evaluator_llm = LangchainLLMWrapper(llm) + client = AsyncOpenAI() + evaluator_llm = llm_factory(args.evaluator_model, client=client) # Run evaluations try: diff --git a/src/ragas/integrations/ag_ui.py b/src/ragas/integrations/ag_ui.py index 69bc928dd..f9bc458e5 100644 --- a/src/ragas/integrations/ag_ui.py +++ b/src/ragas/integrations/ag_ui.py @@ -31,16 +31,21 @@ from ragas.integrations.ag_ui import evaluate_ag_ui_agent from ragas.dataset_schema import EvaluationDataset, SingleTurnSample - from ragas.metrics import AspectCritic + from ragas.metrics.collections import FactualCorrectness + from ragas.llms import llm_factory + from openai import AsyncOpenAI + + client = AsyncOpenAI() + evaluator_llm = llm_factory("gpt-4o-mini", client=client) dataset = EvaluationDataset(samples=[ - SingleTurnSample(user_input="What's the weather in SF?") + SingleTurnSample(user_input="What's the weather in SF?", reference="Use the weather API") ]) result = await evaluate_ag_ui_agent( endpoint_url="http://localhost:8000/agent", dataset=dataset, - metrics=[AspectCritic()] + metrics=[FactualCorrectness(llm=evaluator_llm)] ) Evaluate with multi-turn conversations and tool calls:: @@ -66,6 +71,7 @@ from __future__ import annotations +import inspect import json import logging import math @@ -84,12 +90,53 @@ from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage from ragas.run_config import RunConfig +try: + from ragas.metrics.collections.base import BaseMetric as CollectionsBaseMetric +except ImportError: # pragma: no cover - collections are part of ragas, but guard just in case + CollectionsBaseMetric = t.cast(t.Type[object], None) + +from ragas.metrics.base import Metric, MetricType, SingleTurnMetric + if t.TYPE_CHECKING: - from ragas.metrics.base import Metric + from ragas.metrics.collections.base import BaseMetric as _CollectionsBaseMetric logger = logging.getLogger(__name__) +def _is_collections_metric(metric: Any) -> bool: + """Return True if the metric originates from the collections portfolio.""" + + return CollectionsBaseMetric is not None and isinstance(metric, CollectionsBaseMetric) + + +class _CollectionsSingleTurnMetricAdapter(SingleTurnMetric): + """Adapter that lets collections metrics participate in ragas.evaluate.""" + + def __init__(self, metric: CollectionsBaseMetric): + self._metric = metric + self.name = metric.name + self._parameter_names = [ + name + for name in inspect.signature(metric.ascore).parameters.keys() + if name != "self" + ] + required_columns = set(self._parameter_names) + self.required_columns = {MetricType.SINGLE_TURN: required_columns} + + def init(self, run_config: RunConfig) -> None: # pragma: no cover - no-op for collections + """Collections metrics manage their own initialization.""" + + async def _single_turn_ascore( + self, sample: SingleTurnSample, callbacks: Optional[Any] + ) -> float: + kwargs = {} + for param in self._parameter_names: + kwargs[param] = getattr(sample, param, None) + + result = await self._metric.ascore(**kwargs) + return result.value + + # Lazy imports for ag_ui to avoid hard dependency def _import_ag_ui_core(): """Import AG-UI core types with helpful error message.""" @@ -1035,10 +1082,34 @@ async def _call_ag_ui_endpoint( return events +def _prepare_metrics_for_evaluation( + metrics: t.Sequence[Union[Metric, "_CollectionsBaseMetric"]], + is_multi_turn: bool, +) 
-> t.List[Metric]: + """Normalize metrics so ragas.evaluate can consume them.""" + + prepared: t.List[Metric] = [] + for metric in metrics: + if isinstance(metric, Metric): + prepared.append(metric) + elif _is_collections_metric(metric): + if is_multi_turn: + raise ValueError( + "Collections metrics currently support only single-turn datasets in the AG-UI integration." + ) + prepared.append(_CollectionsSingleTurnMetricAdapter(metric)) + else: + raise TypeError( + "Metrics must be Ragas Metric instances or collections metrics." + ) + + return prepared + + async def evaluate_ag_ui_agent( endpoint_url: str, dataset: EvaluationDataset, - metrics: List["Metric"], + metrics: List[Union[Metric, "_CollectionsBaseMetric"]], metadata: bool = False, run_config: Optional[RunConfig] = None, batch_size: Optional[int] = None, @@ -1069,8 +1140,8 @@ async def evaluate_ag_ui_agent( Dataset containing test queries. Can contain either: - SingleTurnSample: user_input as string - MultiTurnSample: user_input as list of messages - metrics : List[Metric] - List of Ragas metrics to evaluate (e.g., AspectCritic, ToolCallF1). + metrics : List[Metric or collections.BaseMetric] + List of Ragas metrics to evaluate (e.g., ResponseGroundedness, ToolCallF1). metadata : bool, optional Whether to include AG-UI metadata in converted messages (default: False). run_config : RunConfig, optional @@ -1107,8 +1178,15 @@ async def evaluate_ag_ui_agent( >>> from ragas.integrations.ag_ui import evaluate_ag_ui_agent >>> from ragas.dataset_schema import EvaluationDataset, SingleTurnSample - >>> from ragas.metrics import AspectCritic, Faithfulness + >>> from ragas.metrics.collections import ( + ... ContextPrecisionWithReference, + ... FactualCorrectness, + ... ) + >>> from ragas.llms import llm_factory + >>> from openai import AsyncOpenAI >>> + >>> client = AsyncOpenAI() + >>> evaluator_llm = llm_factory("gpt-4o-mini", client=client) >>> dataset = EvaluationDataset(samples=[ ... SingleTurnSample( ... user_input="What's the weather in San Francisco?", @@ -1119,7 +1197,10 @@ async def evaluate_ag_ui_agent( >>> result = await evaluate_ag_ui_agent( ... endpoint_url="http://localhost:8000/agent", ... dataset=dataset, - ... metrics=[AspectCritic(), Faithfulness()] + ... metrics=[ + ... FactualCorrectness(llm=evaluator_llm), + ... ContextPrecisionWithReference(llm=evaluator_llm), + ... ] ... ) With AG-UI metadata included:: @@ -1127,7 +1208,7 @@ async def evaluate_ag_ui_agent( >>> result = await evaluate_ag_ui_agent( ... endpoint_url="http://localhost:8000/agent", ... dataset=dataset, - ... metrics=[AspectCritic()], + ... metrics=[FactualCorrectness(llm=evaluator_llm)], ... metadata=True # Include run_id, thread_id, etc. ... ) @@ -1174,6 +1255,7 @@ async def evaluate_ag_ui_agent( # Support both single-turn and multi-turn evaluations is_multi_turn = dataset.is_multi_turn() + prepared_metrics = _prepare_metrics_for_evaluation(metrics, is_multi_turn) if is_multi_turn: samples = t.cast(List[MultiTurnSample], dataset.samples) else: @@ -1300,7 +1382,7 @@ async def evaluate_ag_ui_agent( # Run evaluation with metrics evaluation_result = ragas_evaluate( dataset=dataset, - metrics=metrics, + metrics=prepared_metrics, raise_exceptions=raise_exceptions, show_progress=show_progress, run_config=run_config or RunConfig(),
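
For reference, a minimal end-to-end sketch of the single-turn workflow this patch documents. It only reuses names that appear in the diff above; the endpoint URL, model name, and sample content are placeholders, not part of the patch itself.

```python
# Sketch only: mirrors the API surface shown in this patch.
# Assumes an AG-UI agent is running at http://localhost:8000/agent
# and OPENAI_API_KEY is set in the environment.
import asyncio

from openai import AsyncOpenAI

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.llms import llm_factory
from ragas.metrics.collections import FactualCorrectness, ResponseGroundedness


async def main() -> None:
    # Instructor-compatible evaluator LLM, as in the updated docs above.
    client = AsyncOpenAI()
    evaluator_llm = llm_factory("gpt-4o-mini", client=client)

    dataset = EvaluationDataset(
        samples=[
            SingleTurnSample(
                user_input="Who originated the theory of relativity?",
                reference="Albert Einstein originated the theory of relativity.",
            )
        ]
    )

    # Collections metrics are adapted internally by
    # _prepare_metrics_for_evaluation before ragas_evaluate runs.
    result = await evaluate_ag_ui_agent(
        endpoint_url="http://localhost:8000/agent",
        dataset=dataset,
        metrics=[
            FactualCorrectness(llm=evaluator_llm, mode="f1"),
            ResponseGroundedness(llm=evaluator_llm),
        ],
    )
    print(result.to_pandas())


if __name__ == "__main__":
    asyncio.run(main())
```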
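
A companion sketch for the multi-turn weather scenario, which stays on the rule-based `ToolCallF1` path since `_prepare_metrics_for_evaluation` rejects collections metrics for multi-turn datasets. The endpoint path and the tool name/args are assumptions for illustration.

```python
# Sketch only: rule-based tool-call evaluation, no evaluator LLM required.
# The endpoint path and tool name/args are illustrative placeholders.
import asyncio

from ragas.dataset_schema import EvaluationDataset, MultiTurnSample
from ragas.integrations.ag_ui import evaluate_ag_ui_agent
from ragas.messages import HumanMessage, ToolCall
from ragas.metrics import ToolCallF1


async def main() -> None:
    dataset = EvaluationDataset(
        samples=[
            MultiTurnSample(
                user_input=[HumanMessage(content="What's the weather in Paris?")],
                reference_tool_calls=[
                    ToolCall(name="get_weather", args={"location": "Paris"})
                ],
            )
        ]
    )

    # ToolCallF1 compares the agent's emitted tool calls against
    # reference_tool_calls; no grading LLM is involved.
    result = await evaluate_ag_ui_agent(
        endpoint_url="http://localhost:8000/agentic_chat",
        dataset=dataset,
        metrics=[ToolCallF1()],
    )
    print(result.to_pandas())


if __name__ == "__main__":
    asyncio.run(main())
```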