[evaluation] feat: Configurable credentials for aoai graders (#43026)

kdestin · web-flow · commit cb05cfebe106 · 2025-09-19T15:31:13.000-07:00
* feat: Add credential support to graders

* style: Sort imports

* test: Add test validating that graders can be evaluated with a TokenCredential

* fix: Resolve inverted validation logic for auth

* tests,fix: Update exception message

* docs: Update docstring

* chore: Update CHANGELOG.md
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ### Features Added
 
+- AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
+
 ### Breaking Changes
 
 ### Bugs Fixed
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py
@@ -1,13 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
+from typing_extensions import TypeIs
 
-from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION, TokenScope
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._user_agent import UserAgentSingleton
-from typing import Any, Dict, Union
-from azure.ai.evaluation._common._experimental import experimental
+from azure.core.credentials import TokenCredential
+
+if TYPE_CHECKING:
+    from openai.lib.azure import AzureADTokenProvider
 
 
 @experimental
@@ -30,6 +36,8 @@ class AzureOpenAIGrader:
         to be formatted as a dictionary that matches the specifications of the sub-types of
         the TestingCriterion alias specified in (OpenAI's SDK)[https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151].
     :type grader_config: Dict[str, Any]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -43,31 +51,52 @@ def __init__(
         *,
         model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
         grader_config: Dict[str, Any],
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         self._model_config = model_config
         self._grader_config = grader_config
+        self._credential = credential
 
         if kwargs.get("validate", True):
             self._validate_model_config()
             self._validate_grader_config()
 
     def _validate_model_config(self) -> None:
         """Validate the model configuration that this grader wrapper is using."""
-        if "api_key" not in self._model_config or not self._model_config.get("api_key"):
-            msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
-            raise EvaluationException(
-                message=msg,
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.AOAI_GRADER,
-            )
+        # msg stays None when the configuration is acceptable; otherwise it holds the error text to raise.
+        msg = None
+        if self._is_azure_model_config(self._model_config):
+            # Azure configs are accepted with either a non-empty api_key in the config
+            # or a credential that was passed to __init__.
+            if not any(auth for auth in (self._model_config.get("api_key"), self._credential)):
+                msg = (
+                    f"{type(self).__name__}: Requires an api_key in the supplied model_config, "
+                    + "or providing a credential to the grader's __init__ method. "
+                )
+
+        else:
+            # Non-Azure configs are only checked for a non-empty api_key; the credential is not consulted.
+            if "api_key" not in self._model_config or not self._model_config.get("api_key"):
+                msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
+
+        if msg is None:
+            return
+
+        # Both failure paths share one raise so the blame/category/target stay identical.
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.AOAI_GRADER,
+        )
 
     def _validate_grader_config(self) -> None:
         """Validate the grader configuration that this grader wrapper is using."""
 
         return
 
+    @staticmethod
+    def _is_azure_model_config(
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+    ) -> TypeIs[AzureOpenAIModelConfiguration]:
+        """Check whether a model configuration is an Azure OpenAI configuration.
+
+        The check is purely structural: a configuration is treated as an Azure one when it
+        contains an "azure_endpoint" key. The TypeIs return annotation allows type checkers
+        to narrow the configuration's type at call sites.
+
+        :param model_config: The model configuration to inspect.
+        :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+        :return: True if "azure_endpoint" is a key of model_config, otherwise False.
+        :rtype: bool
+        """
+        return "azure_endpoint" in model_config
+
     def get_client(self) -> Any:
         """Construct an appropriate OpenAI client using this grader's model configuration.
         Returns a slightly different client depending on whether or not this grader's model
@@ -77,23 +106,38 @@ def get_client(self) -> Any:
         :rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
         """
         default_headers = {"User-Agent": UserAgentSingleton().value}
-        if "azure_endpoint" in self._model_config:
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] = self._model_config
+        api_key: Optional[str] = model_config.get("api_key")
+
+        if self._is_azure_model_config(model_config):
             from openai import AzureOpenAI
 
             # TODO set default values?
             return AzureOpenAI(
-                azure_endpoint=self._model_config["azure_endpoint"],
-                api_key=self._model_config.get("api_key", None),  # Default-style access to appease linters.
+                azure_endpoint=model_config["azure_endpoint"],
+                api_key=api_key,  # Default-style access to appease linters.
                 api_version=DEFAULT_AOAI_API_VERSION,  # Force a known working version
-                azure_deployment=self._model_config.get("azure_deployment", ""),
+                azure_deployment=model_config.get("azure_deployment", ""),
+                azure_ad_token_provider=self.get_token_provider(self._credential) if not api_key else None,
                 default_headers=default_headers,
             )
         from openai import OpenAI
 
         # TODO add default values for base_url and organization?
         return OpenAI(
-            api_key=self._model_config["api_key"],
-            base_url=self._model_config.get("base_url", ""),
-            organization=self._model_config.get("organization", ""),
+            api_key=api_key,
+            base_url=model_config.get("base_url", ""),
+            organization=model_config.get("organization", ""),
             default_headers=default_headers,
         )
+
+    @staticmethod
+    def get_token_provider(cred: TokenCredential) -> "AzureADTokenProvider":
+        """Get the token provider for the AzureOpenAI client.
+
+        :param TokenCredential cred: The Azure authentication credential.
+        :return: A zero-argument callable that requests a token from the credential on each call
+            (using the TokenScope.COGNITIVE_SERVICES_MANAGEMENT scope) and returns the token string.
+        :rtype: openai.lib.azure.AzureADTokenProvider
+        """
+
+        # The credential is not touched here; get_token only runs when the returned callable is invoked.
+        return lambda: cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT).token
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py
@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union, List
+from typing import Any, Dict, List, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import LabelModelGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -37,6 +39,8 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
     :type name: str
     :param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
     :type passing_labels: List[str]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -54,6 +58,7 @@ def __init__(
         model: str,
         name: str,
         passing_labels: List[str],
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = LabelModelGrader(
@@ -64,4 +69,4 @@ def __init__(
             passing_labels=passing_labels,
             type="label_model",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/python_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/python_grader.py
@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union, Optional
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import PythonGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -39,6 +41,8 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
     :param source: Python source code containing the grade function.
         Must define: def grade(sample: dict, item: dict) -> float
     :type source: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -63,6 +67,7 @@ def __init__(
         image_tag: str,
         pass_threshold: float,
         source: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         # Validate pass_threshold
@@ -81,4 +86,4 @@ def __init__(
             type="python",
         )
 
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py
@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import ScoreModelGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -43,6 +45,8 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
     :type pass_threshold: Optional[float]
     :param sampling_params: The sampling parameters for the model.
     :type sampling_params: Optional[Dict[str, Any]]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
     """
@@ -59,6 +63,7 @@ def __init__(
         range: Optional[List[float]] = None,
         pass_threshold: Optional[float] = None,
         sampling_params: Optional[Dict[str, Any]] = None,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         # Validate range and pass_threshold
@@ -88,4 +93,4 @@ def __init__(
 
         grader = ScoreModelGrader(**grader_kwargs)
 
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py
@@ -1,12 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union
-from typing_extensions import Literal
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import StringCheckGrader
+from typing_extensions import Literal
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -33,6 +35,8 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
     :type operation: Literal["eq", "ne", "like", "ilike"]
     :param reference: The reference text. This may include template strings.
     :type reference: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -54,6 +58,7 @@ def __init__(
             "ilike",
         ],
         reference: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = StringCheckGrader(
@@ -63,4 +68,4 @@ def __init__(
             reference=reference,
             type="string_check",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py
@@ -1,12 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union
-from typing_extensions import Literal
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import TextSimilarityGrader
+from typing_extensions import Literal
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -47,6 +49,8 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
     :type reference: str
     :param name: The name of the grader.
     :type name: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -76,6 +80,7 @@ def __init__(
         pass_threshold: float,
         reference: str,
         name: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = TextSimilarityGrader(
@@ -86,4 +91,4 @@ def __init__(
             reference=reference,
             type="text_similarity",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_aoai_graders.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_aoai_graders.py
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_integration_features.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_integration_features.py