
Commit 15da972

guptha23 and Chandra Sekhar Gupta Aravapalli authored

add support for reasoning models as judge for agentic evaluators. (Azure#40416)

* add support for reasoning models as judge for agentic evaluators.
* removing the temporary prompty file created for reasoning models.
* incorporating review comments.
* updated the default tokens for reasoning models to 40000
* updated the default tokens for reasoning models to 60000
* update the doc string for is_reasoning_model parameter.
* update the prompty for reasoning models in memory.
* remove the method to save additional prompty file.
* remove unused imports.
* updating the parameters for tool call accuracy metric.

---------

Co-authored-by: Chandra Sekhar Gupta Aravapalli <[email protected]>
1 parent e37eeb2 commit 15da972

File tree

9 files changed: +103, -10 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 2 additions & 0 deletions
@@ -99,3 +99,5 @@ class _AggregationType(enum.Enum):
     True: "pass",
     False: "fail",
 }
+
+DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 8 additions & 2 deletions
@@ -4,6 +4,7 @@

 import math
 import re
+import os
 from typing import Dict, TypeVar, Union

 from azure.ai.evaluation._legacy.prompty import AsyncPrompty
@@ -39,13 +40,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """

     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False, threshold: int = 3, _higher_is_better: bool = False):
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +64,8 @@ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, ev
             user_agent,
         )

-        self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)

     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.

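For orientation, here is a minimal, self-contained sketch of the pass-through pattern these hunks implement: a concrete evaluator forwards **kwargs to the base class, which reads the optional is_reasoning_model flag and later hands it on to the prompty loader. The class names below are illustrative stand-ins, not the azure-ai-evaluation classes.

# Illustrative sketch only; hypothetical names, not the SDK implementation.
class _PromptyEvaluatorBaseSketch:
    def __init__(self, *, prompty_file: str, **kwargs) -> None:
        # Optional keyword; defaults to False when the caller omits it.
        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
        self._prompty_file = prompty_file


class _IntentResolutionSketch(_PromptyEvaluatorBaseSketch):
    def __init__(self, model_config, **kwargs) -> None:
        # Unrecognized keywords (such as is_reasoning_model) flow through to the base class.
        super().__init__(prompty_file="intent_resolution.prompty", **kwargs)
        self._model_config = model_config


evaluator = _IntentResolutionSketch(model_config={}, is_reasoning_model=True)
assert evaluator._is_reasoning_model is True
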
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 6 additions & 2 deletions
@@ -47,11 +47,15 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+    def __init__(self, model_config, *,
+                 threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

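As a usage sketch under stated assumptions (a reasoning-model deployment such as o3-mini behind an Azure OpenAI endpoint, and IntentResolutionEvaluator exported from azure.ai.evaluation as in the package samples), the new keyword can now be passed straight through the public constructor:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, IntentResolutionEvaluator

# Placeholder configuration values; substitute a real reasoning-model deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="o3-mini",
)

# is_reasoning_model is forwarded via **kwargs to PromptyEvaluatorBase.
intent_resolution = IntentResolutionEvaluator(model_config=model_config, is_reasoning_model=True)

result = intent_resolution(
    query="What are the opening hours of the Eiffel Tower?",
    response="The Eiffel Tower is open from 9:00 AM to 11:45 PM.",
)
print(result)
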
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 7 additions & 2 deletions
@@ -60,11 +60,16 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
+    def __init__(self, model_config, *,
+                 threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config,
+                         prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

Lines changed: 5 additions & 2 deletions
@@ -54,11 +54,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 6 additions & 2 deletions
@@ -64,11 +64,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE):
+    def __init__(self, model_config, *,
+                 threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

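As with the other agentic evaluators, the flag is supplied at construction time. A usage sketch follows; the tool_calls and tool_definitions payload shapes are illustrative assumptions based on the package's published samples rather than anything in this diff.

from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

# Placeholder model configuration; point it at a reasoning-model deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="o3-mini",
)

tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config, is_reasoning_model=True)

result = tool_call_accuracy(
    query="How is the weather in Seattle?",
    tool_calls=[{
        "type": "tool_call",
        "tool_call_id": "call_1",
        "name": "fetch_weather",
        "arguments": {"location": "Seattle"},
    }],
    tool_definitions=[{
        "name": "fetch_weather",
        "description": "Fetches the weather information for the specified location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
        },
    }],
)
print(result)
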
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/prompty/_prompty.py

Lines changed: 13 additions & 0 deletions
@@ -33,6 +33,7 @@
     resolve_references,
     update_dict_recursively,
 )
+from azure.ai.evaluation._constants import DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
 from azure.ai.evaluation._legacy._common._logging import get_logger


@@ -135,6 +136,18 @@ def __init__(
     ):
         path = Path(path)
         configs, self._template = self._parse_prompty(path)
+
+        is_reasoning_model = kwargs.get("is_reasoning_model", False)
+
+        if is_reasoning_model:
+            parameters = configs.get("model", {}).get("parameters", {})
+            if "max_tokens" in parameters:
+                parameters.pop("max_tokens", None)
+                parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+            # Remove unsupported parameters for reasoning models
+            for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
+                parameters.pop(key, None)
+
         configs = resolve_references(configs, base_path=path.parent)
         configs = update_dict_recursively(configs, resolve_references(kwargs, base_path=path.parent))

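The rewrite above is compact enough to restate in isolation. A minimal stand-alone sketch (a hypothetical helper mirroring the added block, not the SDK implementation) shows its effect on a parsed prompty config dict:

# Stand-alone sketch of the reasoning-model parameter rewrite; illustrative helper name.
DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000

def adjust_parameters_for_reasoning_model(configs: dict) -> dict:
    parameters = configs.get("model", {}).get("parameters", {})
    if "max_tokens" in parameters:
        # Reasoning models take max_completion_tokens rather than max_tokens.
        parameters.pop("max_tokens", None)
        parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
    # Sampling-style parameters are not supported by reasoning models, so drop them.
    for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
        parameters.pop(key, None)
    return configs

before = {"model": {"parameters": {"max_tokens": 800, "temperature": 0.0, "top_p": 1.0}}}
print(adjust_parameters_for_reasoning_model(before))
# -> {'model': {'parameters': {'max_completion_tokens': 60000}}}
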
sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/response_completeness.ipynb

Lines changed: 27 additions & 0 deletions
@@ -128,6 +128,33 @@
     "result"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluate with a reasoning model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.evaluation import ResponseCompletenessEvaluator , AzureOpenAIModelConfiguration\n",
+    "from pprint import pprint\n",
+    "\n",
+    "# set is_reasoning_model to True in case the model is a reasoning model (ex: o3-mini, o1-preview)\n",
+    "response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=model_config,\n",
+    "                                                                is_reasoning_model=True)\n",
+    "\n",
+    "result = response_completeness_evaluator(\n",
+    "    response=\"The capital of Japan is Tokyo.\",\n",
+    "    ground_truth=\"The capital of Japan is Tokyo.\"\n",
+    ")\n",
+    "result"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_completeness_evaluator.py

Lines changed: 29 additions & 0 deletions
@@ -29,6 +29,15 @@ def test_initialization(self, mock_model_config):
         # Test initialization of ResponseCompletenessEvaluator
         assert response_completeness_evaluator.threshold == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
         assert response_completeness_evaluator._result_key == ResponseCompletenessEvaluator._RESULT_KEY
+        assert response_completeness_evaluator._is_reasoning_model is False
+
+    def test_initialization2(self, mock_model_config):
+        response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config,
+                                                                        is_reasoning_model=True)
+        # Test initialization of ResponseCompletenessEvaluator
+        assert response_completeness_evaluator.threshold == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
+        assert response_completeness_evaluator._result_key == ResponseCompletenessEvaluator._RESULT_KEY
+        assert response_completeness_evaluator._is_reasoning_model is True

     def test_evaluate_completeness_valid1(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
@@ -67,6 +76,26 @@ def test_evaluate_completeness_valid2(self, mock_model_config):
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
         assert "The response perfectly matches " in result[f"{key}_reason"]

+    def test_evaluate_completeness_valid3(self, mock_model_config):
+        response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config,
+                                                                        is_reasoning_model=True)
+        response_completeness_evaluator._flow = MagicMock(return_value=completeness_response2_async_mock())
+
+        # Test evaluation with valid ground truth and response
+        ground_truth = "The capital of Japan is Tokyo."
+        response = "The capital of Japan is Tokyo."
+        result = response_completeness_evaluator(ground_truth=ground_truth, response=response)
+
+        key = ResponseCompletenessEvaluator._RESULT_KEY
+        assert result is not None
+
+        assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result and
+                f"{key}_reason" in result)
+        assert result[key] == 5
+        assert result[f"{key}_result"] == "pass"
+        assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
+        assert "The response perfectly matches " in result[f"{key}_reason"]
+
     def test_evaluate_completeness_missing_ground_truth(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
         response_completeness_evaluator._flow = MagicMock(return_value=completeness_response1_async_mock())

0 commit comments
