
Commit f1cd1b2

[evaluation] refactor: Make RunSubmitterClient default batch client (remove promptflow) (Azure#42243)
* chore: Remove promptflow dependencies
* fix: Filter inputs before calling function in Batch Engine
  Added to match the behavior of promptflow; see also: https://github.com/microsoft/promptflow/blob/5e6c183474c0a2575bb416d18201e4f9fd562b2e/src/promptflow-core/promptflow/executor/_script_executor.py#L162
* refactor: Use enumerate instead of manually keeping track of line number
* fix,refactor: Unconditionally inject default column mapping from data -> params
  In promptflow's logic for applying a column mapping to data, it unconditionally injects a mapping from each function parameter to data of the same name: https://github.com/microsoft/promptflow/blob/3e297112a2c142caf7c185bcba644d0f66422539/src/promptflow-devkit/promptflow/batch/_batch_inputs_processor.py#L110-L141
  This deviated from the existing logic in this SDK, where that mapping was generated only when the user did not provide a column mapping: https://github.com/Azure/azure-sdk-for-python/blob/f3740540eb5b3d22dc1bccba0eb00b652b124d5f/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py#L51-L52
  The deviation caused one of the parameterized cases of `test_evaluate_another_questions` to fail: the user-provided column mapping referenced a parameter not present in the evaluator, and without the default mapping the evaluation failed because the required parameter was missing: https://github.com/Azure/azure-sdk-for-python/blob/f3740540eb5b3d22dc1bccba0eb00b652b124d5f/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py#L216
  This commit aligns the application of the column mapping in the SDK more closely with the promptflow implementation (see the sketch after this list).
* refactor: Don't shadow `value` variable in apply_column_mapping_to_lines
* feat: Add support for running aggregations in RunSubmitterClient
* tests,fix: Don't log duration as a metric
  Logging it breaks a test that checks for strict equality of metrics.
* refactor: Rewrite RunSubmitterClient.get_details
* fix: Correct the typing of is_onedp_project
* fix,tests: Don't log tokens as metrics
  Removed to match the behavior of the other clients.
* fix: Set error message without depending on run storage
  Promptflow surfaces exceptions by reading them from its "Storage" abstraction, which has not been ported to this SDK.
* tests,fix: Fix test_evaluate_invalid_column_mapping
  PR 40556 accidentally indented the assertion in test_evaluate_invalid_column_mapping into the `pytest.raises` block. That made the test useless, since the `evaluate` call would always raise an exception, which skips over the assertion as it unwinds the stack. This commit unindents the assertion so that it runs.
  Additionally, PR 41919 updated our validation logic to allow column mapping references of arbitrary length, e.g. `${target.foo.bar.baz}`, so this commit also removes the test case that explicitly guarded against `${target.response.one}`.
* fix,tests: Force PFClient-specific tests to use PFClient
* fix,tests: Force CodeClient-specific tests to use CodeClient
* fix: Improve the ergonomics for picking which client is used
  Except for the CodeClient, you only need to set at most one of `_use_pf_client` and `_use_run_submitter_client`.
* feat: Show exception message in run logs
* fix: Make safety evaluation use CodeClient as originally intended
* fix: Don't wrap EvaluationException in BatchEngineError
* refactor: Refactor BatchConfig
* fix: Make raising on error configurable for RunSubmitterClient
* chore: Update changelog
* fix: Uncomment log_path
* chore: Add promptflow to dev-requirements.txt
  Some tests have explicit dependencies on the promptflow implementation. Since we aren't removing that code path yet, allow them to run by installing promptflow for those tests.
* fix: Initialize error_message
* fix: Get exception instead of BatchRunResult
* chore,docs: Clarify changelog
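The default column-mapping injection described in the "fix,refactor" item above can be illustrated with a small, self-contained sketch. The helper below is hypothetical (it is not the SDK's implementation, and `apply_default_column_mapping` is a made-up name); it only demonstrates the behavior of unconditionally adding a `${data.<param>}` mapping for every evaluator parameter the user did not map explicitly.

# Minimal sketch of the default column-mapping behavior described above; the
# function and variable names here are illustrative, not the SDK's actual API.
import inspect
from typing import Any, Callable, Dict


def apply_default_column_mapping(
    evaluator: Callable[..., Any], column_mapping: Dict[str, str]
) -> Dict[str, str]:
    """Unconditionally inject a ${data.<param>} mapping for every evaluator
    parameter the user did not map explicitly (mirroring promptflow's logic)."""
    mapping = dict(column_mapping)
    for param in inspect.signature(evaluator).parameters:
        # Inject defaults even when the user supplied a (possibly partial) mapping.
        mapping.setdefault(param, f"${{data.{param}}}")
    return mapping


def my_evaluator(query: str, response: str) -> Dict[str, float]:
    return {"length_ratio": len(response) / max(len(query), 1)}


# A user mapping that only covers "query" still yields a usable mapping for "response".
print(apply_default_column_mapping(my_evaluator, {"query": "${data.question}"}))
# {'query': '${data.question}', 'response': '${data.response}'}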
1 parent 1f0488c commit f1cd1b2

14 files changed: +299 −117 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -21,6 +21,13 @@ tolerance for harmful responses).
 - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
 
 
+### Other Changes
+
+- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
+  - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
+    This is due to be removed in a future release.
+
+
 ## 1.9.0 (2025-07-02)
 
 ### Features Added
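A hypothetical invocation of the escape hatch mentioned in the changelog entry above is sketched below. The data file, model configuration values, and choice of evaluator are placeholders; only the `_use_pf_client=True` keyword is the documented flag.

# Hypothetical usage of the escape hatch described in the changelog entry above.
from azure.ai.evaluation import RelevanceEvaluator, evaluate

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# Default: the in-SDK batch client (RunSubmitterClient) runs the evaluation.
result = evaluate(
    data="evaluation_data.jsonl",
    evaluators={"relevance": RelevanceEvaluator(model_config)},
)

# Temporary escape hatch: fall back to the legacy promptflow-based engine.
legacy_result = evaluate(
    data="evaluation_data.jsonl",
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    _use_pf_client=True,
)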

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

Lines changed: 5 additions & 7 deletions
@@ -6,11 +6,11 @@
 import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
 
 import nltk
 from azure.storage.blob import ContainerClient
-from typing_extensions import NotRequired, Required, TypeGuard
+from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -127,17 +127,15 @@ def construct_prompty_model_config(
     return prompty_model_config
 
 
-def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
+def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
     """Check if the Azure AI project is an OneDP project.
 
     :param azure_ai_project: The scope of the Azure AI project.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :type azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
    :return: True if the Azure AI project is an OneDP project, False otherwise.
    :rtype: bool
    """
-    if isinstance(azure_ai_project, str):
-        return True
-    return False
+    return isinstance(azure_ai_project, str)
 
 
 def validate_azure_ai_project(o: object) -> AzureAIProject:
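The switch from `bool` to `TypeIs[str]` lets type checkers narrow the argument in both branches of the check. A minimal, self-contained sketch (assuming `typing_extensions` >= 4.10 and using a stand-in for the real `AzureAIProject` TypedDict; the placeholder values are illustrative):

# Sketch of the TypeIs narrowing this change enables.
from typing import Optional, Union

from typing_extensions import TypeIs

# Stand-in for the real AzureAIProject TypedDict; for illustration only.
AzureAIProject = dict


def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
    return isinstance(azure_ai_project, str)


def describe(project: Optional[Union[str, AzureAIProject]]) -> None:
    if is_onedp_project(project):
        # Type checkers narrow `project` to `str` here...
        print(project.upper())
    else:
        # ...and to Optional[AzureAIProject] in this branch (unlike TypeGuard).
        print(project)


describe("https://<endpoint>/api/projects/<project-name>")  # str -> treated as a OneDP project
describe({"subscription_id": "<sub>", "resource_group_name": "<rg>", "project_name": "<name>"})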

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py

Lines changed: 70 additions & 22 deletions
@@ -6,6 +6,7 @@
 import logging
 import pandas as pd
 import sys
+import itertools
 from collections import defaultdict
 from concurrent.futures import Future
 from os import PathLike
@@ -16,15 +17,34 @@
 from ..._legacy._batch_engine._config import BatchEngineConfig
 from ..._legacy._batch_engine._run import Run
 from ..._legacy._adapters._constants import LINE_NUMBER
+from ..._legacy._adapters.types import AttrDict
 from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
+from ..._evaluate._utils import _has_aggregator
+from ..._constants import Prefixes, PF_BATCH_TIMEOUT_SEC
 
+from .._utils import get_int_env_var as get_int
 
-LOGGER = logging.getLogger(__name__)
+
+LOGGER = logging.getLogger("run")
+MISSING_VALUE: Final[int] = sys.maxsize
 
 
 class RunSubmitterClient:
-    def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
-        self._config = config or BatchEngineConfig(LOGGER, use_async=True)
+    def __init__(self, *, raise_on_errors: bool = False, config: Optional[BatchEngineConfig] = None) -> None:
+        if config:
+            self._config = config
+        else:
+            # Generate default config and apply any overrides to the configuration from environment variables
+            self._config = BatchEngineConfig(LOGGER, use_async=True)
+            if (val := get_int(PF_BATCH_TIMEOUT_SEC, MISSING_VALUE)) != MISSING_VALUE:
+                self._config.batch_timeout_seconds = val
+            if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.line_timeout_seconds = val
+            if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
+                self._config.max_concurrency = val
+
+        self._config.raise_on_error = raise_on_errors
+
         self._thread_pool = ThreadPoolExecutorWithContext(
             thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
         )
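The constructor above reads optional overrides from environment variables, using a sentinel to distinguish "variable not set" from any real value. A minimal sketch of that pattern, with a stand-in `get_int` helper and placeholder defaults (the SDK itself uses `get_int_env_var` and `BatchEngineConfig`):

# Sketch of the environment-variable override pattern used above; helper and
# defaults below are illustrative stand-ins, not the SDK's implementation.
import os
import sys
from dataclasses import dataclass

MISSING_VALUE = sys.maxsize


def get_int(name: str, default: int) -> int:
    raw = os.environ.get(name)
    try:
        return int(raw) if raw is not None else default
    except ValueError:
        return default


@dataclass
class Config:
    batch_timeout_seconds: int = 3600  # placeholder default
    line_timeout_seconds: int = 600    # placeholder default
    max_concurrency: int = 10          # placeholder default


config = Config()
if (val := get_int("PF_BATCH_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
    config.batch_timeout_seconds = val
if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
    config.line_timeout_seconds = val
if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
    config.max_concurrency = val

print(config)  # reflects whichever of the three variables are set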
@@ -72,32 +92,60 @@ def run(
         return run_future
 
     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
-
         run = self._get_run(client_run)
 
-        data: Dict[str, List[Any]] = defaultdict(list)
-        stop_at: Final[int] = self._config.default_num_results if not all_results else sys.maxsize
+        def concat(*dataframes: pd.DataFrame) -> pd.DataFrame:
+            return pd.concat(dataframes, axis=1, verify_integrity=True)
 
-        def _update(prefix: str, items: Sequence[Mapping[str, Any]]) -> None:
-            for i, line in enumerate(items):
-                if i >= stop_at:
-                    break
-                for k, value in line.items():
-                    key = f"{prefix}.{k}"
-                    data[key].append(value)
+        def to_dataframe(items: Sequence[Mapping[str, Any]], *, max_length: Optional[int] = None) -> pd.DataFrame:
+            """Convert a sequence of dictionaries to a DataFrame.
 
-        # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
-        # (i.e. a column view of the data)
-        _update("inputs", run.inputs)
-        _update("inputs", [{LINE_NUMBER: i} for i in range(len(run.inputs))])
-        _update("outputs", run.outputs)
+            :param items: Sequence of dictionaries to convert.
+            :type items: Sequence[Mapping[str, Any]]
+            :param max_length: Maximum number of items to include in the DataFrame. If None, include all items.
+            :type max_length: Optional[int]
+            :return: DataFrame containing the items.
+            :rtype: pd.DataFrame
+            """
+            max_length = None if all_results else self._config.default_num_results
+            return pd.DataFrame(data=items if all_results else itertools.islice(items, max_length))
 
-        df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
-        return df
+        inputs = concat(
+            to_dataframe(run.inputs), to_dataframe([{LINE_NUMBER: i} for i in range(len(run.inputs))])
+        ).add_prefix(Prefixes.INPUTS)
+
+        outputs = to_dataframe(run.outputs).add_prefix(Prefixes.OUTPUTS)
+
+        return concat(inputs, outputs)
 
     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
-        return dict(run.metrics)
+        return {**run.metrics, **self._get_aggregated_metrics(client_run)}
+
+    def _get_aggregated_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        aggregated_metrics = None
+        run = self._get_run(client_run)
+        try:
+            if _has_aggregator(run.dynamic_callable):
+                result_df = pd.DataFrame(run.outputs)
+                if len(result_df.columns) == 1 and result_df.columns[0] == "output":
+                    aggregate_input = result_df["output"].tolist()
+                else:
+                    aggregate_input = [AttrDict(item) for item in result_df.to_dict("records")]
+
+                aggr_func = getattr(run.dynamic_callable, "__aggregate__")
+                aggregated_metrics = aggr_func(aggregate_input)
+
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.warning("Error calculating aggregations for evaluator, failed with error %s", ex)
+
+        if not isinstance(aggregated_metrics, dict):
+            LOGGER.warning(
+                "Aggregated metrics for evaluator is not a dictionary will not be logged as metrics",
+            )
+            return {}
+
+        return aggregated_metrics
 
     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
         run = self._get_run(client_run)
@@ -110,7 +158,7 @@ def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
             "duration": str(run.duration),
             "completed_lines": total_lines - failed_lines,
             "failed_lines": failed_lines,
-            # "log_path": "",
+            "log_path": None,
         }
 
    @staticmethod
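The aggregation support added above relies on an evaluator exposing an `__aggregate__` attribute: when present, it is called with the per-line outputs and its dictionary result is merged into the run's metrics. A hypothetical evaluator following this protocol (a made-up example, not one shipped by the SDK):

# Sketch of the evaluator aggregation protocol that _get_aggregated_metrics looks for.
from typing import Dict, List


class ExactMatchEvaluator:
    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
        # Per-line result: 1.0 for an exact match, 0.0 otherwise.
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results: List[Dict[str, float]]) -> Dict[str, float]:
        # Called once with all per-line outputs; the returned dict is logged as metrics.
        scores = [line["exact_match"] for line in line_results]
        return {"exact_match_rate": sum(scores) / len(scores) if scores else 0.0}


evaluator = ExactMatchEvaluator()
per_line = [
    evaluator(response="Paris", ground_truth="Paris"),
    evaluator(response="Lyon", ground_truth="Paris"),
]
print(evaluator.__aggregate__(per_line))  # {'exact_match_rate': 0.5}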

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 41 additions & 5 deletions
@@ -9,7 +9,7 @@
 import re
 import tempfile
 import json
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -876,6 +876,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
         **kwargs,
     )
 
@@ -983,6 +984,7 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
@@ -1016,15 +1018,49 @@
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
 
-    if kwargs.pop("_use_run_submitter_client", False):
-        batch_run_client = RunSubmitterClient()
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif kwargs.pop("_use_pf_client", True):
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-    else:
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df
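The flag handling in `get_client_type` above reduces to a small decision table. The standalone restatement below is a simplified sketch for readability, not the SDK's code; the asserts spell out the notable cases.

# Simplified restatement of the _use_run_submitter_client / _use_pf_client decision table.
from typing import Literal, Optional

Client = Literal["run_submitter", "pf_client", "code_client"]


def pick_client(use_run_submitter: Optional[bool], use_pf: Optional[bool]) -> Client:
    if use_run_submitter is None and use_pf is None:
        return "run_submitter"                # new default
    if use_run_submitter and use_pf:
        raise ValueError("Only one of the two flags may be set to True.")
    if use_run_submitter is False and use_pf is False:
        return "code_client"                  # explicitly opting out of both
    if use_run_submitter:
        return "run_submitter"
    if use_pf:
        return "pf_client"
    # Exactly one flag is False and the other is unset: fall back to the other client.
    return "run_submitter" if use_pf is False else "pf_client"


assert pick_client(None, None) == "run_submitter"   # default engine
assert pick_client(None, True) == "pf_client"       # legacy escape hatch
assert pick_client(True, None) == "run_submitter"
assert pick_client(False, False) == "code_client"   # the only way to get CodeClient
assert pick_client(False, None) == "pf_client"      # opting out of the default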

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/_batch_engine/_config.py

Lines changed: 6 additions & 3 deletions
@@ -19,7 +19,7 @@ class BatchEngineConfig:
     batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
     """The maximum amount of time to wait for all evaluations in the batch to complete."""
 
-    run_timeout_seconds: int = 600
+    line_timeout_seconds: int = 600
     """The maximum amount of time to wait for an evaluation to run against a single entry
     in the data input to complete."""
 
@@ -32,13 +32,16 @@
     default_num_results: int = 100
     """The default number of results to return if you don't ask for all results."""
 
+    raise_on_error: bool = True
+    """Whether to raise an error if an evaluation fails."""
+
     def __post_init__(self):
         if self.logger is None:
             raise ValueError("logger cannot be None")
         if self.batch_timeout_seconds <= 0:
             raise ValueError("batch_timeout_seconds must be greater than 0")
-        if self.run_timeout_seconds <= 0:
-            raise ValueError("run_timeout_seconds must be greater than 0")
+        if self.line_timeout_seconds <= 0:
+            raise ValueError("line_timeout_seconds must be greater than 0")
         if self.max_concurrency <= 0:
             raise ValueError("max_concurrency must be greater than 0")
        if self.default_num_results <= 0: