
Commit 15da972

guptha23 and Chandra Sekhar Gupta Aravapalli authored

add support for reasoning models as judge for agentic evaluators. (Azure#40416)

* add support for reasoning models as judge for agentic evaluators.
* removing the temporary prompty file created for reasoning models.
* incorporating review comments.
* updated the default tokens for reasoning models to 40000
* updated the default tokens for reasoning models to 60000
* update the doc string for is_reasoning_model parameter.
* update the prompty for reasoning models in memory.
* remove the method to save additional prompty file.
* remove unused imports.
* updating the parameters for tool call accuracy metric.

---------

Co-authored-by: Chandra Sekhar Gupta Aravapalli <[email protected]>
1 parent e37eeb2 commit 15da972

File tree

9 files changed: +103, -10 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 2 additions & 0 deletions
@@ -99,3 +99,5 @@ class _AggregationType(enum.Enum):
     True: "pass",
     False: "fail",
 }
+
+DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 8 additions & 2 deletions
@@ -4,6 +4,7 @@

 import math
 import re
+import os
 from typing import Dict, TypeVar, Union

 from azure.ai.evaluation._legacy.prompty import AsyncPrompty
@@ -39,13 +40,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """

     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False, threshold: int = 3, _higher_is_better: bool = False):
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +64,8 @@ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, ev
             user_agent,
         )

-        self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)

     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.

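For orientation, here is a minimal, self-contained sketch of the pass-through pattern these hunks implement: a concrete evaluator forwards **kwargs to the base class, which reads the optional is_reasoning_model flag and later hands it on to the prompty loader. The class names below are illustrative stand-ins, not the azure-ai-evaluation classes.

# Illustrative sketch only; hypothetical names, not the SDK implementation.
class _PromptyEvaluatorBaseSketch:
    def __init__(self, *, prompty_file: str, **kwargs) -> None:
        # Optional keyword; defaults to False when the caller omits it.
        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
        self._prompty_file = prompty_file


class _IntentResolutionSketch(_PromptyEvaluatorBaseSketch):
    def __init__(self, model_config, **kwargs) -> None:
        # Unrecognized keywords (such as is_reasoning_model) flow through to the base class.
        super().__init__(prompty_file="intent_resolution.prompty", **kwargs)
        self._model_config = model_config


evaluator = _IntentResolutionSketch(model_config={}, is_reasoning_model=True)
assert evaluator._is_reasoning_model is True
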
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 6 additions & 2 deletions
@@ -47,11 +47,15 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+    def __init__(self, model_config, *,
+                 threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

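As a usage sketch under stated assumptions (a reasoning-model deployment such as o3-mini behind an Azure OpenAI endpoint, and IntentResolutionEvaluator exported from azure.ai.evaluation as in the package samples), the new keyword can now be passed straight through the public constructor:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, IntentResolutionEvaluator

# Placeholder configuration values; substitute a real reasoning-model deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="o3-mini",
)

# is_reasoning_model is forwarded via **kwargs to PromptyEvaluatorBase.
intent_resolution = IntentResolutionEvaluator(model_config=model_config, is_reasoning_model=True)

result = intent_resolution(
    query="What are the opening hours of the Eiffel Tower?",
    response="The Eiffel Tower is open from 9:00 AM to 11:45 PM.",
)
print(result)
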
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 7 additions & 2 deletions
@@ -60,11 +60,16 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
+    def __init__(self, model_config, *,
+                 threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config,
+                         prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

Lines changed: 5 additions & 2 deletions
@@ -54,11 +54,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 6 additions & 2 deletions
@@ -64,11 +64,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE):
+    def __init__(self, model_config, *,
+                 threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)

     @overload
     def __call__(

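As with the other agentic evaluators, the flag is supplied at construction time. A usage sketch follows; the tool_calls and tool_definitions payload shapes are illustrative assumptions based on the package's published samples rather than anything in this diff.

from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

# Placeholder model configuration; point it at a reasoning-model deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<your-api-key>",
    azure_deployment="o3-mini",
)

tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config, is_reasoning_model=True)

result = tool_call_accuracy(
    query="How is the weather in Seattle?",
    tool_calls=[{
        "type": "tool_call",
        "tool_call_id": "call_1",
        "name": "fetch_weather",
        "arguments": {"location": "Seattle"},
    }],
    tool_definitions=[{
        "name": "fetch_weather",
        "description": "Fetches the weather information for the specified location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string", "description": "The location to fetch weather for."}},
        },
    }],
)
print(result)
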
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_legacy/prompty/_prompty.py

Lines changed: 13 additions & 0 deletions
@@ -33,6 +33,7 @@
     resolve_references,
     update_dict_recursively,
 )
+from azure.ai.evaluation._constants import DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
 from azure.ai.evaluation._legacy._common._logging import get_logger


@@ -135,6 +136,18 @@ def __init__(
     ):
         path = Path(path)
         configs, self._template = self._parse_prompty(path)
+
+        is_reasoning_model = kwargs.get("is_reasoning_model", False)
+
+        if is_reasoning_model:
+            parameters = configs.get("model", {}).get("parameters", {})
+            if "max_tokens" in parameters:
+                parameters.pop("max_tokens", None)
+                parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+            # Remove unsupported parameters for reasoning models
+            for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
+                parameters.pop(key, None)
+
         configs = resolve_references(configs, base_path=path.parent)
         configs = update_dict_recursively(configs, resolve_references(kwargs, base_path=path.parent))

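The rewrite above is compact enough to restate in isolation. A minimal stand-alone sketch (a hypothetical helper mirroring the added block, not the SDK implementation) shows its effect on a parsed prompty config dict:

# Stand-alone sketch of the reasoning-model parameter rewrite; illustrative helper name.
DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000

def adjust_parameters_for_reasoning_model(configs: dict) -> dict:
    parameters = configs.get("model", {}).get("parameters", {})
    if "max_tokens" in parameters:
        # Reasoning models take max_completion_tokens rather than max_tokens.
        parameters.pop("max_tokens", None)
        parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
    # Sampling-style parameters are not supported by reasoning models, so drop them.
    for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
        parameters.pop(key, None)
    return configs

before = {"model": {"parameters": {"max_tokens": 800, "temperature": 0.0, "top_p": 1.0}}}
print(adjust_parameters_for_reasoning_model(before))
# -> {'model': {'parameters': {'max_completion_tokens': 60000}}}
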
sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/response_completeness.ipynb

Lines changed: 27 additions & 0 deletions
@@ -128,6 +128,33 @@
     "result"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluate with a reasoning model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.evaluation import ResponseCompletenessEvaluator , AzureOpenAIModelConfiguration\n",
+    "from pprint import pprint\n",
+    "\n",
+    "# set is_reasoning_model to True in case the model is a reasoning model (ex: o3-mini, o1-preview)\n",
+    "response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=model_config,\n",
+    "                                                                is_reasoning_model=True)\n",
+    "\n",
+    "result = response_completeness_evaluator(\n",
+    "    response=\"The capital of Japan is Tokyo.\",\n",
+    "    ground_truth=\"The capital of Japan is Tokyo.\"\n",
+    ")\n",
+    "result"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_completeness_evaluator.py

Lines changed: 29 additions & 0 deletions
@@ -29,6 +29,15 @@ def test_initialization(self, mock_model_config):
         # Test initialization of ResponseCompletenessEvaluator
         assert response_completeness_evaluator.threshold == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
         assert response_completeness_evaluator._result_key == ResponseCompletenessEvaluator._RESULT_KEY
+        assert response_completeness_evaluator._is_reasoning_model is False
+
+    def test_initialization2(self, mock_model_config):
+        response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config,
+                                                                        is_reasoning_model=True)
+        # Test initialization of ResponseCompletenessEvaluator
+        assert response_completeness_evaluator.threshold == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
+        assert response_completeness_evaluator._result_key == ResponseCompletenessEvaluator._RESULT_KEY
+        assert response_completeness_evaluator._is_reasoning_model is True

     def test_evaluate_completeness_valid1(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
@@ -67,6 +76,26 @@ def test_evaluate_completeness_valid2(self, mock_model_config):
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
         assert "The response perfectly matches " in result[f"{key}_reason"]

+    def test_evaluate_completeness_valid3(self, mock_model_config):
+        response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config,
+                                                                        is_reasoning_model=True)
+        response_completeness_evaluator._flow = MagicMock(return_value=completeness_response2_async_mock())
+
+        # Test evaluation with valid ground truth and response
+        ground_truth = "The capital of Japan is Tokyo."
+        response = "The capital of Japan is Tokyo."
+        result = response_completeness_evaluator(ground_truth=ground_truth, response=response)
+
+        key = ResponseCompletenessEvaluator._RESULT_KEY
+        assert result is not None
+
+        assert (key in result and f"{key}_result" in result and f"{key}_threshold" in result and
+                f"{key}_reason" in result)
+        assert result[key] == 5
+        assert result[f"{key}_result"] == "pass"
+        assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
+        assert "The response perfectly matches " in result[f"{key}_reason"]
+
     def test_evaluate_completeness_missing_ground_truth(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
         response_completeness_evaluator._flow = MagicMock(return_value=completeness_response1_async_mock())

0 commit comments
