Skip to content

Commit 7be9178

Browse files
kdestin, Copilot, singankit
authored
[evaluation] feat: User Configurable Credentials for Prompty-based Evaluators (#42549)
* feat: Make TokenCredential configurable for prompty based evaluators * fix,tests: Remove unused invalid import * tests: Add a test for using credential with prompty based evaluator * docs: Add changelog entry * Update sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py Co-authored-by: Copilot <[email protected]> * refactor: Move credential to be a parameter to __init__ * chore: Update assets.json * Update sdk/evaluation/azure-ai-evaluation/CHANGELOG.md Co-authored-by: Ankit Singhal <[email protected]> --------- Co-authored-by: Copilot <[email protected]> Co-authored-by: Ankit Singhal <[email protected]>
1 parent eee418f commit 7be9178

File tree

15 files changed

+74
-18
lines changed

15 files changed

+74
-18
lines changed

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
### Features Added
66
- Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
7+
- Added support for user-supplied TokenCredentials with LLM based evaluators.
78
- Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
89
- Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
910
- Added support for XPIA and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_468882d958"
6-
}
5+
"Tag": "python/evaluation/azure-ai-evaluation_e9fbe5cd65"
6+
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
6666
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
6767

6868
@override
69-
def __init__(self, model_config, *, threshold=3):
69+
def __init__(self, model_config, *, threshold=3, credential=None):
7070
current_dir = os.path.dirname(__file__)
7171
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
7272
self._threshold = threshold
@@ -76,6 +76,7 @@ def __init__(self, model_config, *, threshold=3):
7676
prompty_file=prompty_path,
7777
result_key=self._RESULT_KEY,
7878
threshold=threshold,
79+
credential=credential,
7980
_higher_is_better=self._higher_is_better,
8081
)
8182

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@
55
import math
66
import re
77
import os
8-
from typing import Dict, TypeVar, Union
8+
from typing import Dict, Optional, TypeVar, Union
99

1010
if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
1111
from promptflow.core._flow import AsyncPrompty
1212
else:
1313
from azure.ai.evaluation._legacy.prompty import AsyncPrompty
1414
from typing_extensions import override
1515

16+
from azure.core.credentials import TokenCredential
1617
from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
1718
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
1819
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ def __init__(
6364
model_config: dict,
6465
eval_last_turn: bool = False,
6566
threshold: int = 3,
67+
credential: Optional[TokenCredential] = None,
6668
_higher_is_better: bool = False,
6769
**kwargs,
6870
) -> None:
@@ -82,7 +84,10 @@ def __init__(
8284
)
8385

8486
self._flow = AsyncPrompty.load(
85-
source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
87+
source=self._prompty_file,
88+
model=prompty_model_config,
89+
token_credential=credential,
90+
is_reasoning_model=self._is_reasoning_model,
8691
)
8792

8893
# __call__ not overridden here because child classes have such varied signatures that there's no point

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
6868
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
6969

7070
@override
71-
def __init__(self, model_config, *, threshold=3):
71+
def __init__(self, model_config, *, credential=None, threshold=3):
7272
current_dir = os.path.dirname(__file__)
7373
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
7474
self._threshold = threshold
@@ -78,6 +78,7 @@ def __init__(self, model_config, *, threshold=3):
7878
prompty_file=prompty_path,
7979
result_key=self._RESULT_KEY,
8080
threshold=threshold,
81+
credential=credential,
8182
_higher_is_better=self._higher_is_better,
8283
)
8384

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
9494
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
9595

9696
@override
97-
def __init__(self, model_config, *, threshold=3, **kwargs):
97+
def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
9898
current_dir = os.path.dirname(__file__)
9999
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY) # Default to no query
100100

@@ -104,6 +104,7 @@ def __init__(self, model_config, *, threshold=3, **kwargs):
104104
prompty_file=prompty_path,
105105
result_key=self._RESULT_KEY,
106106
threshold=threshold,
107+
credential=credential,
107108
_higher_is_better=self._higher_is_better,
108109
)
109110
self._model_config = model_config

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
6161
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
6262

6363
@override
64-
def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
64+
def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
6565
current_dir = os.path.dirname(__file__)
6666
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
6767
self.threshold = threshold
68-
super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
68+
super().__init__(
69+
model_config=model_config,
70+
prompty_file=prompty_path,
71+
result_key=self._RESULT_KEY,
72+
credential=credential,
73+
**kwargs,
74+
)
6975

7076
@overload
7177
def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
7979
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
8080

8181
@override
82-
def __init__(self, model_config, *, threshold=3):
82+
def __init__(self, model_config, *, credential=None, threshold=3):
8383
current_dir = os.path.dirname(__file__)
8484
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
8585
self._threshold = threshold
@@ -89,6 +89,7 @@ def __init__(self, model_config, *, threshold=3):
8989
prompty_file=prompty_path,
9090
result_key=self._RESULT_KEY,
9191
threshold=threshold,
92+
credential=credential,
9293
_higher_is_better=self._higher_is_better,
9394
)
9495

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
7373
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
7474

7575
@override
76-
def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
76+
def __init__(
77+
self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
78+
):
7779
current_dir = os.path.dirname(__file__)
7880
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
7981
self.threshold = threshold
80-
super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
82+
super().__init__(
83+
model_config=model_config,
84+
prompty_file=prompty_path,
85+
result_key=self._RESULT_KEY,
86+
credential=credential,
87+
**kwargs,
88+
)
8189

8290
@overload
8391
def __call__(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
7878
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
7979

8080
@override
81-
def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=super-init-not-called
81+
def __init__(self, model_config, *, threshold: float = 3, credential=None):
8282
current_dir = os.path.dirname(__file__)
8383
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
8484
self._threshold = threshold
@@ -88,6 +88,7 @@ def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=su
8888
prompty_file=prompty_path,
8989
result_key=self._RESULT_KEY,
9090
threshold=threshold,
91+
credential=credential,
9192
_higher_is_better=self._higher_is_better,
9293
)
9394

0 commit comments

Comments (0)