azure-sdk
diff --git a/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/assets.json‎
Lines changed: 1 addition & 1 deletion b/‎sdk/evaluation/azure-ai-evaluation/assets.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py‎
Lines changed: 9 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py‎
Lines changed: 10 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py‎
Lines changed: 106 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py‎
Lines changed: 65 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py‎
Lines changed: 64 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py‎
Lines changed: 87 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py‎
Lines changed: 5 additions & 0 deletions b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py‎
Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,11 @@
 
 ### Features Added
 - New `<evaluator>.binary_aggregate` field added to evaluation result metrics. This field contains the aggregated binary evaluation results for each evaluator, providing a summary of the evaluation outcomes.
+- Added support for Azure OpenAI evaluation via 4 new 'grader' classes, which serve as wrappers around Azure OpenAI grader configurations. These new grader objects can be supplied to the main `evaluate` method as if they were normal callable evaluators. The new classes are:
+    - AzureOpenAIGrader (general class for experienced users)
+    - AzureOpenAILabelGrader
+    - AzureOpenAIStringCheckGrader
+    - AzureOpenAITextSimilarityGrader
 
 ### Breaking Changes
 
 
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_e33b6c53d7"
+  "Tag": "python/evaluation/azure-ai-evaluation_497634c2bf"
 }
@@ -40,6 +40,11 @@
     Message,
     OpenAIModelConfiguration,
 )
+from ._aoai.aoai_grader import AzureOpenAIGrader
+from ._aoai.label_grader import AzureOpenAILabelGrader
+from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
+from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
+
 
 _patch_all = []
 
@@ -89,6 +94,10 @@
     "CodeVulnerabilityEvaluator",
     "UngroundedAttributesEvaluator",
     "ToolCallAccuracyEvaluator",
+    "AzureOpenAIGrader",
+    "AzureOpenAILabelGrader",
+    "AzureOpenAIStringCheckGrader",
+    "AzureOpenAITextSimilarityGrader",
 ]
 
 __all__.extend([p for p in _patch_all if p not in __all__])
@@ -0,0 +1,10 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+
+# Public surface of the ``_aoai`` subpackage: only the base grader wrapper is re-exported here.
+from .aoai_grader import AzureOpenAIGrader
+
+__all__ = [
+    "AzureOpenAIGrader",
+]
@@ -0,0 +1,106 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+
+from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from typing import Any, Dict, Union
+from azure.ai.evaluation._common._experimental import experimental
+
+
+@experimental
+class AzureOpenAIGrader:
+    """
+    Base class for Azure OpenAI grader wrappers, recommended only for use by experienced OpenAI API users.
+    Combines a model configuration and any grader configuration
+    into a singular object that can be used in evaluations.
+
+    Supplying an AzureOpenAIGrader to the `evaluate` method will cause an asynchronous request to evaluate
+    the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param grader_config: The grader configuration to use for the grader. This is expected
+        to be formatted as a dictionary that matches the specifications of the sub-types of
+        the TestingCriterion alias specified in
+        `OpenAI's SDK <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151>`_.
+    :type grader_config: Dict[str, Any]
+    :keyword validate: Whether to validate the model and grader configurations when the
+        grader is constructed. Defaults to True.
+    :paramtype validate: bool
+    :raises ~azure.ai.evaluation._exceptions.EvaluationException: If validation is enabled and
+        the model configuration has no non-empty ``api_key``.
+    """
+
+    id = "aoai://general"
+
+    def __init__(
+        self,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        grader_config: Dict[str, Any],
+        **kwargs: Any
+    ):
+        self._model_config = model_config
+        self._grader_config = grader_config
+
+        # Validation runs by default; callers opt out with validate=False.
+        if kwargs.get("validate", True):
+            self._validate_model_config()
+            self._validate_grader_config()
+
+    def _validate_model_config(self) -> None:
+        """Validate the model configuration that this grader wrapper is using.
+
+        :raises ~azure.ai.evaluation._exceptions.EvaluationException: If ``api_key`` is
+            missing from the model configuration or is empty.
+        """
+        # A missing key and an empty/None value are both rejected.
+        if not self._model_config.get("api_key"):
+            msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.AOAI_GRADER,
+            )
+
+    def _validate_grader_config(self) -> None:
+        """Validate the grader configuration that this grader wrapper is using.
+
+        Currently performs no checks.
+        """
+        return
+
+    def get_model_config(self) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
+        """Get the model configuration that this grader wrapper is using.
+
+        :return: The model configuration.
+        :rtype: Union[
+            ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+            ~azure.ai.evaluation.OpenAIModelConfiguration
+        ]
+        """
+        return self._model_config
+
+    def get_grader_config(self) -> Any:
+        """Get the grader configuration that this grader wrapper is using.
+
+        :return: The grader configuration.
+        :rtype: Any
+        """
+        return self._grader_config
+
+    def get_client(self) -> Any:
+        """Construct an appropriate OpenAI client using this grader's model configuration.
+        Returns a slightly different client depending on whether or not this grader's model
+        configuration is for Azure OpenAI or OpenAI.
+
+        :return: The OpenAI client.
+        :rtype: Union[~openai.OpenAI, ~openai.AzureOpenAI]
+        """
+        # The presence of "azure_endpoint" is what marks a configuration as Azure OpenAI.
+        if "azure_endpoint" in self._model_config:
+            from openai import AzureOpenAI
+            # TODO set default values?
+            return AzureOpenAI(
+                azure_endpoint=self._model_config["azure_endpoint"],
+                api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
+                api_version=self._model_config.get("api_version", DEFAULT_AOAI_API_VERSION),
+                azure_deployment=self._model_config.get("azure_deployment", ""),
+            )
+        from openai import OpenAI
+        # Pass None rather than "" for unset optional fields: the OpenAI client only falls back
+        # to its own defaults (environment variables / the public API URL) when the value is None.
+        return OpenAI(
+            api_key=self._model_config["api_key"],
+            base_url=self._model_config.get("base_url"),
+            organization=self._model_config.get("organization"),
+        )
@@ -0,0 +1,65 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union, List
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.eval_create_params import TestingCriterionLabelModel
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+@experimental
+class AzureOpenAILabelGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's label model graders.
+
+    Supplying an AzureOpenAILabelGrader to the `evaluate` method will cause an asynchronous request to evaluate
+    the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param input: The list of label-based testing criterion for this grader. Individual
+        values of this list are expected to be dictionaries that match the format of any of the valid
+        `TestingCriterionLabelModelInput <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L125C1-L125C32>`_
+        subtypes.
+    :type input: List[Dict[str, str]]
+    :param labels: A list of strings representing the classification labels of this grader.
+    :type labels: List[str]
+    :param model: The model to use for the evaluation. Must support structured outputs.
+    :type model: str
+    :param name: The name of the grader.
+    :type name: str
+    :param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
+    :type passing_labels: List[str]
+    :param kwargs: Additional keyword arguments, forwarded unchanged to the
+        AzureOpenAIGrader constructor.
+    :type kwargs: Any
+    """
+
+    id = "aoai://label_model"
+
+    def __init__(
+        self,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        input: List[Dict[str, str]],
+        labels: List[str],
+        model: str,
+        name: str,
+        passing_labels: List[str],
+        **kwargs: Any
+    ):
+        # Build the OpenAI grader definition; the "type" discriminator is fixed for this wrapper.
+        grader = TestingCriterionLabelModel(
+            input=input,
+            labels=labels,
+            model=model,
+            name=name,
+            passing_labels=passing_labels,
+            type="label_model",
+        )
+        super().__init__(model_config, grader, **kwargs)
@@ -0,0 +1,64 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union
+from typing_extensions import Literal
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.eval_string_check_grader import EvalStringCheckGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+@experimental
+class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's string check graders.
+
+    Supplying an AzureOpenAIStringCheckGrader to the `evaluate` method will cause an asynchronous request to evaluate
+    the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param input: The input text. This may include template strings.
+    :type input: str
+    :param name: The name of the grader.
+    :type name: str
+    :param operation: The string check operation to perform. One of `eq`, `ne`, `like`, or `ilike`.
+    :type operation: Literal["eq", "ne", "like", "ilike"]
+    :param reference: The reference text. This may include template strings.
+    :type reference: str
+    :param kwargs: Additional keyword arguments, forwarded unchanged to the
+        AzureOpenAIGrader constructor.
+    :type kwargs: Any
+    """
+
+    id = "aoai://string_check"
+
+    def __init__(
+        self,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        input: str,
+        name: str,
+        operation: Literal[
+            "eq",
+            "ne",
+            "like",
+            "ilike",
+        ],
+        reference: str,
+        **kwargs: Any
+    ):
+        # Build the OpenAI grader definition; the "type" discriminator is fixed for this wrapper.
+        grader = EvalStringCheckGrader(
+            input=input,
+            name=name,
+            operation=operation,
+            reference=reference,
+            type="string_check",
+        )
+        super().__init__(model_config, grader, **kwargs)
@@ -0,0 +1,87 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union
+from typing_extensions import Literal
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.eval_text_similarity_grader import EvalTextSimilarityGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+@experimental
+class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's text similarity graders.
+
+    Supplying an AzureOpenAITextSimilarityGrader to the `evaluate` method will cause an asynchronous request to
+    evaluate the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param evaluation_metric: The evaluation metric to use.
+    :type evaluation_metric: Literal[
+            "fuzzy_match",
+            "bleu",
+            "gleu",
+            "meteor",
+            "rouge_1",
+            "rouge_2",
+            "rouge_3",
+            "rouge_4",
+            "rouge_5",
+            "rouge_l",
+            "cosine",
+        ]
+    :param input: The text being graded.
+    :type input: str
+    :param pass_threshold: A float score where a value greater than or equal indicates a passing grade.
+    :type pass_threshold: float
+    :param reference: The text being graded against.
+    :type reference: str
+    :param name: The name of the grader.
+    :type name: str
+    :param kwargs: Additional keyword arguments, forwarded unchanged to the
+        AzureOpenAIGrader constructor.
+    :type kwargs: Any
+    """
+
+    id = "aoai://text_similarity"
+
+    def __init__(
+        self,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        evaluation_metric: Literal[
+            "fuzzy_match",
+            "bleu",
+            "gleu",
+            "meteor",
+            "rouge_1",
+            "rouge_2",
+            "rouge_3",
+            "rouge_4",
+            "rouge_5",
+            "rouge_l",
+            "cosine",
+        ],
+        input: str,
+        pass_threshold: float,
+        reference: str,
+        name: str,
+        **kwargs: Any
+    ):
+        # Build the OpenAI grader definition; the "type" discriminator is fixed for this wrapper.
+        grader = EvalTextSimilarityGrader(
+            evaluation_metric=evaluation_metric,
+            input=input,
+            pass_threshold=pass_threshold,
+            name=name,
+            reference=reference,
+            type="text_similarity",
+        )
+        super().__init__(model_config, grader, **kwargs)
@@ -62,6 +62,7 @@ class EvaluationRunProperties:
     RUN_TYPE = "runType"
     EVALUATION_RUN = "_azureml.evaluation_run"
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"
+    NAME_MAP = "_azureml.evaluation_name_map"
 
 
 @experimental
@@ -102,3 +103,7 @@ class _AggregationType(enum.Enum):
 
 DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000
 BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
+
+# NOTE(review): presumably the column/key name under which AOAI grader results are grouped - confirm against callers.
+AOAI_COLUMN_NAME = "aoai"
+# NOTE(review): presumably the run name used for OpenAI eval runs when none is supplied - confirm against callers.
+DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
+# Fallback api_version for AzureOpenAI clients built from a model config that does not set one.
+DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
Original file line number	Diff line number	Diff line change
`@@ -2,5 +2,5 @@`
`2`	`2`	`"AssetsRepo": "Azure/azure-sdk-assets",`
`3`	`3`	`"AssetsRepoPrefixPath": "python",`
`4`	`4`	`"TagPrefix": "python/evaluation/azure-ai-evaluation",`
`5`		`- "Tag": "python/evaluation/azure-ai-evaluation_e33b6c53d7"`
	`5`	`+ "Tag": "python/evaluation/azure-ai-evaluation_497634c2bf"`
`6`	`6`	`}`