[evaluation] chore: Enable tests on 3.13, disable tests on 3.14 (#43362)

kdestin · web-flow · commit ee73a6e67e7c · 2025-10-15T17:30:37.000-07:00
* chore: Specify correct lower bound for pandas on 3.14

* chore: Enable 3.13

* chore: Disable 3.14

* chore: Add 3.13 tests

* chore: Move red-team dep install to dev-requirements

* docs: Fix misc rst formatting issues

* docs,fix: Remove admonitions with no matching blocks

* fix: Add environment marker for redteam extra

    pyrit only supports python3.10 and up

* chore: Bump min bounds for pyrit

* fix: Fix pandas min bound for 3.13
diff --git a/eng/tools/azure-sdk-tools/ci_tools/functions.py b/eng/tools/azure-sdk-tools/ci_tools/functions.py
@@ -55,7 +55,7 @@
     "sdk/textanalytics/azure-ai-textanalytics",
 ]
 
-TEST_COMPATIBILITY_MAP = {"azure-ai-ml": ">=3.7", "azure-ai-evaluation": ">=3.9, !=3.13.*"}
+TEST_COMPATIBILITY_MAP = {"azure-ai-ml": ">=3.7"}
 TEST_PYTHON_DISTRO_INCOMPATIBILITY_MAP = {
     "azure-storage-blob": "pypy",
     "azure-storage-queue": "pypy",
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py
@@ -18,8 +18,9 @@
 
 @experimental
 class AzureOpenAIGrader:
-    """
-    Base class for Azure OpenAI grader wrappers, recommended only for use by experienced OpenAI API users.
+    """Base class for Azure OpenAI grader wrappers.
+
+    Recommended only for use by experienced OpenAI API users.
     Combines a model configuration and any grader configuration
     into a singular object that can be used in evaluations.
 
@@ -28,20 +29,16 @@ class AzureOpenAIGrader:
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
-    :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param grader_config: The grader configuration to use for the grader. This is expected
         to be formatted as a dictionary that matches the specifications of the sub-types of
-        the TestingCriterion alias specified in (OpenAI's SDK)[https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151].
+        the TestingCriterion alias specified in `OpenAI's SDK <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151>`_.
     :type grader_config: Dict[str, Any]
     :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
     :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
-
-
     """
 
     id = "azureai://built-in/evaluators/azure-openai/custom_grader"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py
@@ -14,21 +14,18 @@
 
 @experimental
 class AzureOpenAILabelGrader(AzureOpenAIGrader):
-    """
-    Wrapper class for OpenAI's label model graders.
+    """Wrapper class for OpenAI's label model graders.
 
     Supplying a LabelGrader to the `evaluate` method will cause an asynchronous request to evaluate
     the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
-    :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param input: The list of label-based testing criterion for this grader. Individual
         values of this list are expected to be dictionaries that match the format of any of the valid
-        (TestingCriterionLabelModelInput)[https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L125C1-L125C32]
+        `TestingCriterionLabelModelInput <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L125C1-L125C32>`_
         subtypes.
     :type input: List[Dict[str, str]]
     :param labels: A list of strings representing the classification labels of this grader.
@@ -43,8 +40,6 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
     :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
-
-
     """
 
     id = "azureai://built-in/evaluators/azure-openai/label_grader"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/python_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/python_grader.py
@@ -14,8 +14,7 @@
 
 @experimental
 class AzureOpenAIPythonGrader(AzureOpenAIGrader):
-    """
-    Wrapper class for OpenAI's Python code graders.
+    """Wrapper class for OpenAI's Python code graders.
 
     Enables custom Python-based evaluation logic with flexible scoring and
     pass/fail thresholds. The grader executes user-provided Python code
@@ -27,16 +26,13 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
-    :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param name: The name of the grader.
     :type name: str
     :param image_tag: The image tag for the Python execution environment.
     :type image_tag: str
-    :param pass_threshold: Score threshold for pass/fail classification.
-        Scores >= threshold are considered passing.
+    :param pass_threshold: Score threshold for pass/fail classification. Scores >= threshold are considered passing.
     :type pass_threshold: float
     :param source: Python source code containing the grade function.
         Must define: def grade(sample: dict, item: dict) -> float
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py
@@ -14,8 +14,7 @@
 
 @experimental
 class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
-    """
-    Wrapper class for OpenAI's score model graders.
+    """Wrapper class for OpenAI's score model graders.
 
     Enables continuous scoring evaluation with custom prompts and flexible
     conversation-style inputs. Supports configurable score ranges and
@@ -27,10 +26,8 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
-    :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param input: The input messages for the grader. List of conversation
         messages with role and content.
     :type input: List[Dict[str, str]]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py
@@ -15,18 +15,14 @@
 
 @experimental
 class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
-    """
-    Wrapper class for OpenAI's string check graders.
+    """Wrapper class for OpenAI's string check graders.
 
     Supplying a StringCheckGrader to the `evaluate` method will cause an asynchronous request to evaluate
     the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
-    :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,~azure.ai.evaluation.OpenAIModelConfiguration]
     :param input: The input text. This may include template strings.
     :type input: str
     :param name: The name of the grader.
@@ -39,8 +35,6 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
     :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
-
-
     """
 
     id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py
@@ -15,32 +15,19 @@
 
 @experimental
 class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
-    """
-    Wrapper class for OpenAI's string check graders.
+    """Wrapper class for OpenAI's string check graders.
 
     Supplying a StringCheckGrader to the `evaluate` method will cause an asynchronous request to evaluate
     the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
     evaluation results.
 
     :param model_config: The model configuration to use for the grader.
     :type model_config: Union[
-        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration
-    ]
+            ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+            ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param evaluation_metric: The evaluation metric to use.
-    :type evaluation_metric: Literal[
-            "fuzzy_match",
-            "bleu",
-            "gleu",
-            "meteor",
-            "rouge_1",
-            "rouge_2",
-            "rouge_3",
-            "rouge_4",
-            "rouge_5",
-            "rouge_l",
-            "cosine",
-        ]
+    :type evaluation_metric: Literal["fuzzy_match", "bleu", "gleu", "meteor", "rouge_1", "rouge_2", "rouge_3",
+        "rouge_4", "rouge_5", "rouge_l", "cosine"]
     :param input: The text being graded.
     :type input: str
     :param pass_threshold: A float score where a value greater than or equal indicates a passing grade.
@@ -53,8 +40,6 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
     :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
-
-
     """
 
     id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_bleu_score_evaluator]
             :end-before: [END threshold_bleu_score_evaluator]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the metric
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -33,8 +33,7 @@ def value(self) -> str:
 
 
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
     including reasoning.
 
     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -66,6 +65,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize and call a GroundednessEvaluator.
 
     .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_groundedness_evaluator]
             :end-before: [END threshold_groundedness_evaluator]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -17,36 +17,32 @@
 
 @experimental
 class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
-     provided ground truth.
+    """Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+    provided ground truth.
+
     The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
     claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
     and relevance of the content provided.
+
     The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
     evaluation of the response's content quality.
+
     Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
     particularly in text generation tasks where conveying all essential details is crucial for clarity,
     context, and correctness.
+
     Completeness scores range from 1 to 5:
+
     1: Fully incomplete — Contains none of the necessary information.
     2: Barely complete — Contains only a small portion of the required information.
     3: Moderately complete — Covers about half of the required content.
     4: Mostly complete — Includes most of the necessary details with minimal omissions.
     5: Fully complete — Contains all key information without any omissions.
+
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START completeness_evaluator]
-            :end-before: [END completeness_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
-
     .. admonition:: Example using Azure AI Project URL:
 
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -40,6 +40,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
     .. admonition:: Example:
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
             :start-after: [START task_adherence_evaluator]
             :end-before: [END task_adherence_evaluator]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py
@@ -33,25 +33,6 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START ungrounded_attributes_evaluator]
-            :end-before: [END ungrounded_attributes_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.
-
-    .. admonition:: Example using Azure AI Project URL:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START ungrounded_attributes_evaluator]
-            :end-before: [END ungrounded_attributes_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the metric
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team_result.py
@@ -517,21 +517,23 @@ def to_scorecard(self) -> Optional[RedTeamingScorecard]:
         return self.scan_result.get("scorecard", None) if self.scan_result else None
 
     def to_eval_qr_json_lines(self) -> str:
-        """
-        Converts conversations in messages format to query-response format suitable for evaluation.
+        """Converts conversations in messages format to query-response format suitable for evaluation.
 
         The output format follows the JSONL pattern with each line containing:
-        {
-            "query": "user message content",
-            "response": "assistant message content",
-            "risk_category": "risk category",
-            "attack_strategy": "strategy name",
-            "attack_complexity": "complexity level",
-            "attack_success": "true|false", (if available from evaluation)
-            "category": "risk category", (if available from evaluation)
-            "severity_level": "low|medium|high", (if available from evaluation)
-            "threshold": "threshold value" (if available from evaluation)
-        }
+
+        .. code-block:: javascript
+
+            {
+                "query": "user message content",
+                "response": "assistant message content",
+                "risk_category": "risk category",
+                "attack_strategy": "strategy name",
+                "attack_complexity": "complexity level",
+                "attack_success": "true|false", // (if available from evaluation)
+                "category": "risk category", // (if available from evaluation)
+                "severity_level": "low|medium|high", // (if available from evaluation)
+                "threshold": "threshold value" // (if available from evaluation)
+            }
 
         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
diff --git a/sdk/evaluation/azure-ai-evaluation/dev_requirements.txt b/sdk/evaluation/azure-ai-evaluation/dev_requirements.txt
@@ -12,4 +12,4 @@ aiohttp
 filelock
 promptflow-core>=1.17.1
 promptflow-devkit>=1.17.1
--e ../azure-ai-evaluation
+-e ../azure-ai-evaluation[redteam]
diff --git a/sdk/evaluation/azure-ai-evaluation/setup.py b/sdk/evaluation/azure-ai-evaluation/setup.py
diff --git a/sdk/evaluation/ci.yml b/sdk/evaluation/ci.yml
diff --git a/sdk/evaluation/platform-matrix.json b/sdk/evaluation/platform-matrix.json

Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@`
`55`	`55`	`"sdk/textanalytics/azure-ai-textanalytics",`
`56`	`56`	`]`
`57`	`57`
`58`		`-TEST_COMPATIBILITY_MAP = {"azure-ai-ml": ">=3.7", "azure-ai-evaluation": ">=3.9, !=3.13.*"}`
	`58`	`+TEST_COMPATIBILITY_MAP = {"azure-ai-ml": ">=3.7"}`
`59`	`59`	`TEST_PYTHON_DISTRO_INCOMPATIBILITY_MAP = {`
`60`	`60`	`"azure-storage-blob": "pypy",`
`61`	`61`	`"azure-storage-queue": "pypy",`